[Mlir-commits] [mlir] 90ecfa2 - [mlir][linalg] NFC - Move some utils in preparation for revamping mapping of scf.forall

Nicolas Vasilache llvmlistbot at llvm.org
Mon Jul 24 16:20:06 PDT 2023


Author: Nicolas Vasilache
Date: 2023-07-25T01:19:57+02:00
New Revision: 90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7

URL: https://github.com/llvm/llvm-project/commit/90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7
DIFF: https://github.com/llvm/llvm-project/commit/90ecfa2a408fd5a4486cf0f4a66dead6c586a1d7.diff

LOG: [mlir][linalg] NFC - Move some utils in preparation for revamping mapping of scf.forall

Added: 
    mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
    mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

Modified: 
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
    mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
    mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
    mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
    mlir/lib/Dialect/Utils/StaticValueUtils.cpp
    utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
index 3b10fcb77aaf39..a1cfa406c60ceb 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -31,51 +31,7 @@ namespace mlir {
 class DialectRegistry;
 namespace transform {
 namespace gpu {
-
-/// Helper type for functions that generate ids for the mapping of a
-/// scf.forall.
-struct IdBuilderResult {
-  // Ops used to replace the forall induction variables.
-  SmallVector<Value> mappingIdOps;
-  // Actual mapping sizes used to predicate the forall body when they are
-  // smaller than the available mapping sizes.
-  SmallVector<int64_t> predicateMappingSizes;
-  // Ops used to predicate the forall body when predicateMappingSizes is smaller
-  // than the available mapping sizes.
-  SmallVector<Value> predicateIdOps;
-};
-
-/// Common gpu id builder type, allows the configuration of lowering for various
-/// mapping schemes. Takes:
-///   - A rewriter with insertion point set before the forall op to rewrite.
-///   - The loc of the forall op to rewrite.
-///   - A list of positive integers carrying the mapping sizes for the current
-///     forall op to rewrite.
-using GpuIdBuilderFnType =
-    std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
-
-/// Helper struct for configuring the rewrite of mapped scf.forall ops to
-/// various gpu id configurations.
-struct GpuIdBuilder {
-  GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
-      : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
-        mappingAttributes(), idBuilder() {}
-
-  /// List of OpFoldResult carrying the  multi-dimensional number of
-  /// threads available in the current kernel (i.e. the current blockDims in
-  /// CUDA parlance).
-  ArrayRef<OpFoldResult> blockDimsOfr;
-
-  /// A list of positive integers carrying the number of available mapping
-  /// resources that can trigger predication,
-  ArrayRef<int64_t> availableMappingSizes;
-
-  /// The mapping attributes targeted by this generator.
-  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-
-  /// The constructor that builds the concrete IR for mapping ids.
-  GpuIdBuilderFnType idBuilder;
-};
+struct GpuIdBuilder;
 
 /// Map the top level `scf.forall` op to GPU Thread Blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
@@ -121,11 +77,6 @@ DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
     Operation *target, ArrayRef<int64_t> blockDimsOfr,
     ArrayRef<int64_t> warpDims, bool syncAfterDistribute);
 
-/// Find the unique top level scf::ForallOp within a given target op.
-DiagnosedSilenceableFailure
-findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
-                     TransformOpInterface transformOp);
-
 } // namespace gpu
 } // namespace transform
 

diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
new file mode 100644
index 00000000000000..ac10f5c5008eff
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
@@ -0,0 +1,157 @@
+//===- Utils.h - Utils for GPU transform ops --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
+#define MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+namespace gpu {
+class GPUOp;
+class LaunchOp;
+enum class MappingId : uint64_t;
+} // namespace gpu
+namespace scf {
+class ForallOp;
+} // namespace scf
+namespace transform {
+namespace gpu {
+
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+struct IdBuilderResult {
+  // Ops used to replace the forall induction variables.
+  SmallVector<Value> mappingIdOps;
+  // Actual mapping sizes used to predicate the forall body when they are
+  // smaller than the available mapping sizes.
+  SmallVector<int64_t> predicateMappingSizes;
+  // Ops used to predicate the forall body when predicateMappingSizes is smaller
+  // than the available mapping sizes.
+  SmallVector<Value> predicateIdOps;
+};
+
+/// Common gpu id builder type, allows the configuration of lowering for various
+/// mapping schemes. Takes:
+///   - A rewriter with insertion point set before the forall op to rewrite.
+///   - The loc of the forall op to rewrite.
+///   - A list of positive integers carrying the mapping sizes for the current
+///     forall op to rewrite.
+using GpuIdBuilderFnType =
+    std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
+
+/// Helper struct for configuring the rewrite of mapped scf.forall ops to
+/// various gpu id configurations.
+struct GpuIdBuilder {
+  GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
+      : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
+        mappingAttributes(), idBuilder() {}
+
+  /// List of OpFoldResult carrying the multi-dimensional number of
+  /// threads available in the current kernel (i.e. the current blockDims in
+  /// CUDA parlance).
+  ArrayRef<OpFoldResult> blockDimsOfr;
+
+  /// A list of positive integers carrying the number of available mapping
+  /// resources that can trigger predication.
+  ArrayRef<int64_t> availableMappingSizes;
+
+  /// The mapping attributes targeted by this generator.
+  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+
+  /// The constructor that builds the concrete IR for mapping ids.
+  GpuIdBuilderFnType idBuilder;
+};
+
+/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuBlockIdBuilder : public GpuIdBuilder {
+  GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                    ArrayRef<int64_t> mappingSizes);
+};
+
+/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without
+/// any reindexing.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuThreadIdBuilder : public GpuIdBuilder {
+  GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                     ArrayRef<int64_t> mappingSizes);
+};
+
+/// Builder for warp ids used in mapping scf.forall to warps.
+/// This builder requires a specification of the number of warps along each
+/// dimension to more finely control mapping to warps, as well as predication,
+/// rather than by solely analyzing the IR.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuWarpIdBuilder : public GpuIdBuilder {
+  GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                   ArrayRef<int64_t> mappingSizes);
+  /// Static specification of the warp size.
+  /// In the future this may be configured by the transformation.
+  static constexpr int64_t kWarpSize = 32;
+};
+
+/// Builder for linear ids used in mapping scf.forall to reindexed threads.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 1-D sizes for predicate generation.
+struct GpuLinearIdBuilder : public GpuIdBuilder {
+  GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                     ArrayRef<int64_t> mappingSizes);
+};
+
+/// Determine if the size of the kernel configuration is supported by the
+/// GPU architecture being used.
+/// TODO this is currently hardwired to CUDA, parameterize and generalize.
+DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
+                                           std::optional<int64_t> gridDimX,
+                                           std::optional<int64_t> gridDimY,
+                                           std::optional<int64_t> gridDimZ,
+                                           std::optional<int64_t> blockDimX,
+                                           std::optional<int64_t> blockDimY,
+                                           std::optional<int64_t> blockDimZ);
+
+/// Create an empty-body gpu::LaunchOp using the provided kernel settings
+/// and put a terminator within.
+DiagnosedSilenceableFailure
+createGpuLaunch(RewriterBase &rewriter, Location loc,
+                TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp,
+                std::optional<int64_t> gridDimX = std::nullopt,
+                std::optional<int64_t> gridDimY = std::nullopt,
+                std::optional<int64_t> gridDimZ = std::nullopt,
+                std::optional<int64_t> blockDimX = std::nullopt,
+                std::optional<int64_t> blockDimY = std::nullopt,
+                std::optional<int64_t> blockDimZ = std::nullopt);
+
+/// Alter kernel configuration of the given kernel.
+DiagnosedSilenceableFailure
+alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch,
+               TransformOpInterface transformOp,
+               std::optional<int64_t> gridDimX = std::nullopt,
+               std::optional<int64_t> gridDimY = std::nullopt,
+               std::optional<int64_t> gridDimZ = std::nullopt,
+               std::optional<int64_t> blockDimX = std::nullopt,
+               std::optional<int64_t> blockDimY = std::nullopt,
+               std::optional<int64_t> blockDimZ = std::nullopt);
+
+/// Find the unique top level scf::ForallOp within a given target op.
+DiagnosedSilenceableFailure
+findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
+                     TransformOpInterface transformOp);
+
+} // namespace gpu
+} // namespace transform
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H

diff  --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
index 8c9b5e567f6699..b8f7a26ab6a178 100644
--- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h
@@ -84,6 +84,9 @@ SmallVector<OpFoldResult> getAsIndexOpFoldResult(MLIRContext *ctx,
 
 /// If ofr is a constant integer or an IntegerAttr, return the integer.
 std::optional<int64_t> getConstantIntValue(OpFoldResult ofr);
+/// If all ofrs are constant integers or IntegerAttrs, return the integers.
+std::optional<SmallVector<int64_t>>
+getConstantIntValues(ArrayRef<OpFoldResult> ofrs);
 
 /// Return true if `ofr` is constant integer equal to `value`.
 bool isConstantIntValue(OpFoldResult ofr, int64_t value);

diff  --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
index a2c07fa213ffd9..73b77343ef5073 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRGPUTransformOps
   GPUTransformOps.cpp
+  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/TransformOps

diff  --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 9182ccd96e84a8..07470c24ae2d69 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
+#include "mlir/Dialect/GPU/TransformOps/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -622,204 +623,6 @@ void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
 // Block and thread mapping utilities.
 //===----------------------------------------------------------------------===//
 
-namespace {
-
-/// Return a flattened thread id for the workgroup with given sizes.
-static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
-                                 ArrayRef<OpFoldResult> blockDimsOfr) {
-  LLVM_DEBUG(llvm::interleaveComma(
-                 blockDimsOfr,
-                 DBGS() << "----buildLinearThreadId with blockDimsOfr:  ");
-             llvm::dbgs() << "\n");
-  assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");
-  AffineExpr tx, ty, tz, BDX, BDY;
-  bindDims(rewriter.getContext(), tx, ty, tz);
-  bindSymbols(rewriter.getContext(), BDX, BDY);
-  IndexType indexType = rewriter.getIndexType();
-  SmallVector<OpFoldResult> threadsAndWorkGroups{
-      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
-      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
-      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
-  threadsAndWorkGroups.push_back(blockDimsOfr[0]);
-  threadsAndWorkGroups.push_back(blockDimsOfr[1]);
-  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
-      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
-  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
-}
-
-/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuBlockIdBuilder : public GpuIdBuilder {
-
-  GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
-                    ArrayRef<int64_t> mappingSizes)
-      : GpuIdBuilder(blockDims, mappingSizes) {
-    mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
-                         GPUBlockMappingAttr::get(ctx, Blocks::DimY),
-                         GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
-    idBuilder = [](RewriterBase &rewriter, Location loc,
-                   ArrayRef<int64_t> forallMappingSizes) {
-      IndexType indexType = rewriter.getIndexType();
-      SmallVector<Value> ids{
-          rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
-          rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
-          rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
-      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
-      // predicate generation.
-      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
-                             ids};
-    };
-  }
-};
-
-/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without
-/// any reindexing.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuThreadIdBuilder : public GpuIdBuilder {
-  GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
-                     ArrayRef<int64_t> mappingSizes)
-      : GpuIdBuilder(blockDims, mappingSizes) {
-    mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
-                         GPUThreadMappingAttr::get(ctx, Threads::DimY),
-                         GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
-    idBuilder = [](RewriterBase &rewriter, Location loc,
-                   ArrayRef<int64_t> forallMappingSizes) {
-      IndexType indexType = rewriter.getIndexType();
-      SmallVector<Value> ids{
-          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
-          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
-          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
-      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
-      // predicate generation.
-      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
-                             ids};
-    };
-  }
-};
-
-/// Builder for warp ids used in mapping scf.forall to warps.
-/// This builder requires a specification of the number of warps along each
-/// dimension to more finely control mapping to warps as well a predication than
-/// by solely analyzing the IR.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 3-D sizes for predicate generation.
-struct GpuWarpIdBuilder : public GpuIdBuilder {
-  GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
-                   ArrayRef<int64_t> mappingSizes)
-      : GpuIdBuilder(blockDims, mappingSizes) {
-    mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
-                         GPUWarpMappingAttr::get(ctx, Warps::DimY),
-                         GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
-    idBuilder = [this](RewriterBase &rewriter, Location loc,
-                       ArrayRef<int64_t> forallMappingSizes) {
-      // Build the linear warp id and decompose it in the basis of
-      // `forallMappingSizes`.
-      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
-      AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
-      OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply(
-          rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
-      Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
-      // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
-      // "row-major" order.
-      SmallVector<int64_t> reverseBasisSizes(
-          llvm::reverse(this->availableMappingSizes));
-      SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
-      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
-      SmallVector<Value> ids;
-      // Reverse back to be in [x, y, z] order.
-      for (AffineExpr e : llvm::reverse(delinearizingExprs))
-        ids.push_back(
-            affine::makeComposedAffineApply(rewriter, loc, e, {warpId}));
-
-      // clang-format off
-      LDBG("----linearId: " << linearId);
-          LDBG("----warpId: " << warpId);
-      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
-                                       DBGS() << "--delinearization basis: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(strides,
-                                       DBGS() << "--delinearization strides: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(delinearizingExprs,
-                                       DBGS() << "--delinearization exprs: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
-                 llvm::dbgs() << "\n";);
-      // clang-format on
-
-      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
-      // predicate generation.
-      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
-                             ids};
-    };
-  }
-
-  /// Static specification of the warp size.
-  /// In the future this may be configured by the transformation.
-  static constexpr int64_t kWarpSize = 32;
-};
-
-/// Builder for linear ids used in mapping scf.forall to reindexed threads.
-/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
-/// as 1-D sizes for predicate generation.
-struct GpuLinearIdBuilder : public GpuIdBuilder {
-  GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
-                     ArrayRef<int64_t> mappingSizes)
-      : GpuIdBuilder(blockDims, mappingSizes) {
-    mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
-                         GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
-                         GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
-    idBuilder = [this](RewriterBase &rewriter, Location loc,
-                       ArrayRef<int64_t> forallMappingSizes) {
-      // Build the linear thread id and decompose it in the basis of
-      // `forallMappingSizes`.
-      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
-      // Sizes in [x, y, z] -> [z, y x] order to properly compute strides in
-      // "row-major" order.
-      SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
-      SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
-      AffineExpr d0;
-      bindDims(rewriter.getContext(), d0);
-      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
-      SmallVector<Value> ids;
-      // Reverse back to be in [x, y, z] order.
-      for (AffineExpr e : llvm::reverse(delinearizingExprs))
-        ids.push_back(
-            affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
-
-      // clang-format off
-      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
-                                       DBGS() << "--delinearization basis: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(strides,
-                                       DBGS() << "--delinearization strides: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(delinearizingExprs,
-                                       DBGS() << "--delinearization exprs: ");
-                 llvm::dbgs() << "\n";
-                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
-                 llvm::dbgs() << "\n";);
-      // clang-format on
-
-      // Compute and return the 1-D actual mapping size spanned by the linearId,
-      // it will be used to predicate against the linearized total number of
-      // threads.
-      int64_t actualMappingSize = 1;
-      for (int64_t s : forallMappingSizes)
-        actualMappingSize *= s;
-
-      // Return 3-D ids for indexing rewrite and 1-D size and id for
-      // predicate generation.
-      return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
-                             SmallVector<Value>{linearId}};
-    };
-  }
-};
-
-} // namespace
-
 static DiagnosedSilenceableFailure
 definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                       Operation *target, const Twine &message) {
@@ -905,121 +708,6 @@ verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Determines if the size of the kernel configuration is supported by the
-/// GPU architecture being used. It presently makes use of CUDA limitations,
-/// however that aspect may be enhanced for other GPUs.
-static DiagnosedSilenceableFailure checkGpuLimits(
-    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
-    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
-    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
-    std::optional<int64_t> blockDimZ) {
-
-  static constexpr int maxTotalBlockdim = 1024;
-  static constexpr int maxBlockdimx = 1024;
-  static constexpr int maxBlockdimy = 1024;
-  static constexpr int maxBlockdimz = 64;
-  static constexpr int maxTotalGriddim = 2147483647;
-  static constexpr int maxGriddimx = 2147483647;
-  static constexpr int maxGriddimy = 65535;
-  static constexpr int maxGriddimz = 65535;
-
-  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
-          maxTotalBlockdim ||
-      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
-          maxTotalGriddim ||
-      blockDimX.value_or(1) > maxBlockdimx ||
-      blockDimY.value_or(1) > maxBlockdimy ||
-      blockDimZ.value_or(1) > maxBlockdimz ||
-      gridDimY.value_or(1) > maxGriddimy ||
-      gridDimZ.value_or(1) > maxGriddimz ||
-      gridDimX.value_or(1) > maxGriddimx) {
-    return transformOp.emitSilenceableError()
-           << "Trying to launch a GPU kernel with grid_dims = ("
-           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
-           << gridDimZ.value_or(1) << ") block_dims = ("
-           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
-           << blockDimZ.value_or(1) << "). It is larger than the limits.";
-  }
-  return DiagnosedSilenceableFailure::success();
-}
-
-/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
-/// and put a terminator within.
-static DiagnosedSilenceableFailure
-createGpuLaunch(RewriterBase &rewriter, Location loc,
-                TransformOpInterface transformOp, LaunchOp &launchOp,
-                std::optional<int64_t> gridDimX = std::nullopt,
-                std::optional<int64_t> gridDimY = std::nullopt,
-                std::optional<int64_t> gridDimZ = std::nullopt,
-                std::optional<int64_t> blockDimX = std::nullopt,
-                std::optional<int64_t> blockDimY = std::nullopt,
-                std::optional<int64_t> blockDimZ = std::nullopt) {
-  DiagnosedSilenceableFailure diag =
-      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
-                     blockDimY, blockDimZ);
-  if (!diag.succeeded())
-    return diag;
-
-  auto createConst = [&](int dim) {
-    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
-  };
-  OpBuilder::InsertionGuard guard(rewriter);
-  Value one = createConst(1);
-  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
-  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
-  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
-  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
-  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
-  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
-  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
-                                       blkSizeX, blkSizeY, blkSizeZ);
-  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
-  rewriter.create<TerminatorOp>(loc);
-  return DiagnosedSilenceableFailure::success();
-}
-
-/// Alter kernel configuration of the given kernel.
-static DiagnosedSilenceableFailure
-alterGpuLaunch(RewriterBase &rewriter, LaunchOp gpuLaunch,
-               TransformOpInterface transformOp,
-               std::optional<int64_t> gridDimX = std::nullopt,
-               std::optional<int64_t> gridDimY = std::nullopt,
-               std::optional<int64_t> gridDimZ = std::nullopt,
-               std::optional<int64_t> blockDimX = std::nullopt,
-               std::optional<int64_t> blockDimY = std::nullopt,
-               std::optional<int64_t> blockDimZ = std::nullopt) {
-  DiagnosedSilenceableFailure diag =
-      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
-                     blockDimY, blockDimZ);
-  if (!diag.succeeded())
-    return diag;
-
-  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
-  OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
-  auto createConstValue = [&](int dim) {
-    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
-                                                   dim);
-  };
-
-  if (gridDimX.has_value())
-    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
-  if (gridDimY.has_value())
-    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
-  if (gridDimZ.has_value())
-    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
-  if (blockDimX.has_value())
-    gpuLaunch.getBlockSizeXMutable().assign(
-        createConstValue(blockDimX.value()));
-  if (blockDimY.has_value())
-    gpuLaunch.getBlockSizeYMutable().assign(
-        createConstValue(blockDimY.value()));
-  if (blockDimZ.has_value())
-    gpuLaunch.getBlockSizeZMutable().assign(
-        createConstValue(blockDimZ.value()));
-  return DiagnosedSilenceableFailure::success();
-}
-
 /// Struct to return the result of the rewrite of a forall operation.
 struct ForallRewriteResult {
   SmallVector<int64_t> mappingSizes;
@@ -1223,26 +911,6 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
   return DiagnosedSilenceableFailure::success();
 }
 
-DiagnosedSilenceableFailure
-mlir::transform::gpu::findTopLevelForallOp(Operation *target,
-                                           scf::ForallOp &topLevelForallOp,
-                                           TransformOpInterface transformOp) {
-  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
-    if (forallOp->getParentOfType<scf::ForallOp>())
-      return WalkResult::advance();
-    if (topLevelForallOp)
-      // TODO: Handle multiple forall if they are independent.
-      return WalkResult::interrupt();
-    topLevelForallOp = forallOp;
-    return WalkResult::advance();
-  });
-
-  if (walkResult.wasInterrupted())
-    return transformOp.emitSilenceableError()
-           << "could not find a unique topLevel scf.forall";
-  return DiagnosedSilenceableFailure::success();
-}
-
 DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
     transform::TransformRewriter &rewriter, Operation *target,
     ApplyToEachResultList &results, transform::TransformState &state) {

diff  --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
new file mode 100644
index 00000000000000..3ba9bf1a5a14a4
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -0,0 +1,346 @@
+//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/TransformOps/Utils.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+using namespace mlir::transform;
+using namespace mlir::transform::gpu;
+
+#define DEBUG_TYPE "gpu-transforms"
+
+#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
+
+/// Return a flattened thread id for the workgroup with given sizes.
+static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
+                                 ArrayRef<OpFoldResult> blockDimsOfr) {
+  LLVM_DEBUG(llvm::interleaveComma(
+                 blockDimsOfr,
+                 DBGS() << "----buildLinearThreadId with blockDimsOfr:  ");
+             llvm::dbgs() << "\n");
+  assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");
+  AffineExpr tx, ty, tz, BDX, BDY;
+  bindDims(rewriter.getContext(), tx, ty, tz);
+  bindSymbols(rewriter.getContext(), BDX, BDY);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<OpFoldResult> threadsAndWorkGroups{
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
+  threadsAndWorkGroups.push_back(blockDimsOfr[0]);
+  threadsAndWorkGroups.push_back(blockDimsOfr[1]);
+  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
+      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
+  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
+}
+
+namespace mlir {
+namespace transform {
+namespace gpu {
+
+GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx,
+                                     ArrayRef<OpFoldResult> blockDims,
+                                     ArrayRef<int64_t> mappingSizes)
+    : GpuIdBuilder(blockDims, mappingSizes) {
+  mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
+                       GPUBlockMappingAttr::get(ctx, Blocks::DimY),
+                       GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
+  idBuilder = [](RewriterBase &rewriter, Location loc,
+                 ArrayRef<int64_t> forallMappingSizes) {
+    IndexType indexType = rewriter.getIndexType();
+    SmallVector<Value> ids{
+        rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+        rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+        rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
+    // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+    // predicate generation.
+    return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+  };
+}
+
+GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx,
+                                       ArrayRef<OpFoldResult> blockDims,
+                                       ArrayRef<int64_t> mappingSizes)
+    : GpuIdBuilder(blockDims, mappingSizes) {
+  mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
+                       GPUThreadMappingAttr::get(ctx, Threads::DimY),
+                       GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
+  idBuilder = [](RewriterBase &rewriter, Location loc,
+                 ArrayRef<int64_t> forallMappingSizes) {
+    IndexType indexType = rewriter.getIndexType();
+    SmallVector<Value> ids{
+        rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+        rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+        rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+    // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+    // predicate generation.
+    return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+  };
+}
+
+GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx,
+                                   ArrayRef<OpFoldResult> blockDims,
+                                   ArrayRef<int64_t> mappingSizes)
+    : GpuIdBuilder(blockDims, mappingSizes) {
+  mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
+                       GPUWarpMappingAttr::get(ctx, Warps::DimY),
+                       GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
+  idBuilder = [this](RewriterBase &rewriter, Location loc,
+                     ArrayRef<int64_t> forallMappingSizes) {
+    // Build the linear warp id and decompose it in the basis of
+    // `forallMappingSizes`.
+    Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+    OpFoldResult warpIdOfr = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
+    Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
+    // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+    // "row-major" order.
+    SmallVector<int64_t> reverseBasisSizes(
+        llvm::reverse(this->availableMappingSizes));
+    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+    SmallVector<Value> ids;
+    // Reverse back to be in [x, y, z] order.
+    for (AffineExpr e : llvm::reverse(delinearizingExprs))
+      ids.push_back(
+          affine::makeComposedAffineApply(rewriter, loc, e, {warpId}));
+
+    // clang-format off
+      LDBG("----linearId: " << linearId);
+          LDBG("----warpId: " << warpId);
+      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n";);
+    // clang-format on
+
+    // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+    // predicate generation.
+    return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes}, ids};
+  };
+}
+
+GpuLinearIdBuilder::GpuLinearIdBuilder(MLIRContext *ctx,
+                                       ArrayRef<OpFoldResult> blockDims,
+                                       ArrayRef<int64_t> mappingSizes)
+    : GpuIdBuilder(blockDims, mappingSizes) {
+  mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
+                       GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
+                       GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
+  idBuilder = [this](RewriterBase &rewriter, Location loc,
+                     ArrayRef<int64_t> forallMappingSizes) {
+    // Build the linear thread id and decompose it in the basis of
+    // `forallMappingSizes`.
+    Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+    // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+    // "row-major" order.
+    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+    AffineExpr d0;
+    bindDims(rewriter.getContext(), d0);
+    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+    SmallVector<Value> ids;
+    // Reverse back to be in [x, y, z] order.
+    for (AffineExpr e : llvm::reverse(delinearizingExprs))
+      ids.push_back(
+          affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
+
+    // clang-format off
+      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n";);
+    // clang-format on
+
+    // Compute and return the 1-D actual mapping size spanned by the linearId,
+    // it will be used to predicate against the linearized total number of
+    // threads.
+    int64_t actualMappingSize = 1;
+    for (int64_t s : forallMappingSizes)
+      actualMappingSize *= s;
+
+    // Return 3-D ids for indexing rewrite and 1-D size and id for
+    // predicate generation.
+    return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
+                           SmallVector<Value>{linearId}};
+  };
+}
+
+DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
+                                           std::optional<int64_t> gridDimX,
+                                           std::optional<int64_t> gridDimY,
+                                           std::optional<int64_t> gridDimZ,
+                                           std::optional<int64_t> blockDimX,
+                                           std::optional<int64_t> blockDimY,
+                                           std::optional<int64_t> blockDimZ) {
+
+  // TODO: pass a configuration object to set the limits properly.
+  static constexpr int maxTotalBlockdim = 1024;
+  static constexpr int maxBlockdimx = 1024;
+  static constexpr int maxBlockdimy = 1024;
+  static constexpr int maxBlockdimz = 64;
+  static constexpr int maxTotalGriddim = 2147483647;
+  static constexpr int maxGriddimx = 2147483647;
+  static constexpr int maxGriddimy = 65535;
+  static constexpr int maxGriddimz = 65535;
+
+  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
+          maxTotalBlockdim ||
+      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
+          maxTotalGriddim ||
+      blockDimX.value_or(1) > maxBlockdimx ||
+      blockDimY.value_or(1) > maxBlockdimy ||
+      blockDimZ.value_or(1) > maxBlockdimz ||
+      gridDimY.value_or(1) > maxGriddimy ||
+      gridDimZ.value_or(1) > maxGriddimz ||
+      gridDimX.value_or(1) > maxGriddimx) {
+    return transformOp.emitSilenceableError()
+           << "Trying to launch a GPU kernel with grid_dims = ("
+           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
+           << gridDimZ.value_or(1) << ") block_dims = ("
+           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
+           << blockDimZ.value_or(1) << "). It is larger than the limits.";
+  }
+  return DiagnosedSilenceableFailure::success();
+}
+
+DiagnosedSilenceableFailure createGpuLaunch(
+    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
+    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
+    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
+    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
+    std::optional<int64_t> blockDimZ) {
+  DiagnosedSilenceableFailure diag =
+      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
+                     blockDimY, blockDimZ);
+  if (!diag.succeeded())
+    return diag;
+
+  auto createConst = [&](int dim) {
+    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
+  };
+  OpBuilder::InsertionGuard guard(rewriter);
+  Value one = createConst(1);
+  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
+  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
+  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
+  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
+  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
+  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
+  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
+                                       blkSizeX, blkSizeY, blkSizeZ);
+  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
+  rewriter.create<TerminatorOp>(loc);
+  return DiagnosedSilenceableFailure::success();
+}
+
+/// Alter kernel configuration of the given kernel.
+DiagnosedSilenceableFailure alterGpuLaunch(
+    RewriterBase &rewriter, LaunchOp gpuLaunch,
+    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
+    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
+    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
+    std::optional<int64_t> blockDimZ) {
+  DiagnosedSilenceableFailure diag =
+      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
+                     blockDimY, blockDimZ);
+  if (!diag.succeeded())
+    return diag;
+
+  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
+  auto createConstValue = [&](int dim) {
+    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
+                                                   dim);
+  };
+
+  if (gridDimX.has_value())
+    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
+  if (gridDimY.has_value())
+    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
+  if (gridDimZ.has_value())
+    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
+  if (blockDimX.has_value())
+    gpuLaunch.getBlockSizeXMutable().assign(
+        createConstValue(blockDimX.value()));
+  if (blockDimY.has_value())
+    gpuLaunch.getBlockSizeYMutable().assign(
+        createConstValue(blockDimY.value()));
+  if (blockDimZ.has_value())
+    gpuLaunch.getBlockSizeZMutable().assign(
+        createConstValue(blockDimZ.value()));
+  return DiagnosedSilenceableFailure::success();
+}
+
+DiagnosedSilenceableFailure
+findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
+                     TransformOpInterface transformOp) {
+  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
+    if (forallOp->getParentOfType<scf::ForallOp>())
+      return WalkResult::advance();
+    if (topLevelForallOp)
+      // TODO: Handle multiple forall if they are independent.
+      return WalkResult::interrupt();
+    topLevelForallOp = forallOp;
+    return WalkResult::advance();
+  });
+
+  if (walkResult.wasInterrupted())
+    return transformOp.emitSilenceableError()
+           << "could not find a unique topLevel scf.forall";
+  return DiagnosedSilenceableFailure::success();
+}
+
+} // namespace gpu
+} // namespace transform
+} // namespace mlir

diff  --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 7db793b766a1b1..2e0bafb4fc6545 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -121,6 +121,20 @@ std::optional<int64_t> getConstantIntValue(OpFoldResult ofr) {
   return std::nullopt;
 }
 
+std::optional<SmallVector<int64_t>>
+getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+  bool failed = false;
+  SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+    auto cv = getConstantIntValue(ofr);
+    if (!cv.has_value())
+      failed = true;
+    return cv.has_value() ? cv.value() : 0;
+  });
+  if (failed)
+    return std::nullopt;
+  return res;
+}
+
 /// Return true if `ofr` is constant integer equal to `value`.
 bool isConstantIntValue(OpFoldResult ofr, int64_t value) {
   auto val = getConstantIntValue(ofr);

diff  --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index d1601a3e5937f3..2d110883f57d39 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4747,12 +4747,12 @@ gentbl_cc_library(
 
 cc_library(
     name = "GPUTransformOps",
-    srcs = [
-        "lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp",
-    ],
-    hdrs = [
-        "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h",
-    ],
+    srcs = glob([
+        "lib/Dialect/GPU/TransformOps/*.cpp",
+    ]),
+    hdrs = glob([
+        "include/mlir/Dialect/GPU/TransformOps/*.h",
+    ]),
     includes = ["include"],
     deps = [
         ":AffineDialect",


        


More information about the Mlir-commits mailing list