[Mlir-commits] [mlir] c59465e - [mlir][Transform] Add support for mapping to GPU warps and to linear ids

Nicolas Vasilache llvmlistbot at llvm.org
Mon Mar 20 01:05:42 PDT 2023


Author: Nicolas Vasilache
Date: 2023-03-20T01:05:32-07:00
New Revision: c59465e1203dd78d06e15f7ddf62141807dbd5a7

URL: https://github.com/llvm/llvm-project/commit/c59465e1203dd78d06e15f7ddf62141807dbd5a7
DIFF: https://github.com/llvm/llvm-project/commit/c59465e1203dd78d06e15f7ddf62141807dbd5a7.diff

LOG: [mlir][Transform] Add support for mapping to GPU warps and to linear ids

This revision refactors the implementation of mapping to threads so that warps and linear ids can additionally be specified.

`warp_dims` is currently specified along with `block_dims` as a transform attribute.

Linear ids, on the other hand, use the flattened block_dims to predicate on the first (linearized) k threads.
An additional GPULinearIdMappingAttr is added to the GPU dialect to allow specifying loops mapped to this new scheme.
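
For illustration (a sketch only; the printed form of the new attribute is assumed to mirror the existing `#gpu.thread<x>` syntax, i.e. `#gpu.linear<x>`), a 2x6 `scf.forall` mapped to linear ids under `block_dims = [32, 4, 1]` is rewritten so that `linear_id = tx + ty * 32 + tz * 32 * 4`, the loop indices become `(linear_id mod 2, (linear_id / 2) mod 6)`, and the body is predicated on `linear_id < 12`:

```
// Hypothetical input; loop sizes and attribute values are illustrative.
scf.forall (%i, %j) in (2, 6) {
  // ... body using %i and %j ...
} {mapping = [#gpu.linear<x>, #gpu.linear<y>]}
```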

Various implementation and transform op semantics cleanups are also applied.
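
With the updated assembly formats, usage looks roughly as follows (a sketch: handle names and sizes are illustrative, `warp_dims` and `sync_after_distribute` are optional):

```
%launch = transform.structured.match ops{["gpu.launch"]} in %arg0
  : (!pdl.operation) -> !pdl.operation
// Map the top-level scf.forall to blocks.
%blocks = transform.gpu.map_forall_to_blocks %launch grid_dims = [4, 2, 1]
// Map nested scf.forall ops to threads, warps or linear ids within each block.
%threads = transform.gpu.map_nested_forall_to_threads %blocks
  block_dims = [32, 4, 1] warp_dims = [4, 1, 1]
```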

Reviewed By: ThomasRaoux

Differential Revision: https://reviews.llvm.org/D146130

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
    mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
    mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
    mlir/test/Dialect/GPU/transform-gpu-failing.mlir
    mlir/test/Dialect/GPU/transform-gpu.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
index 3b261acdee83a..699390c2f2959 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td
@@ -64,6 +64,41 @@ def GPUWarpMappingAttr : GPU_Attr<"GPUWarpMapping", "warp", [
   }];
 }
 
+def LinearIdEnum : I64EnumAttr<"LinearId", "linear ids for loop mapping", [
+    DimX, DimY, DimZ]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
+def GPULinearIdMapping : GPU_Attr<"GPULinearIdMapping", "linear", [
+  DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] >  {
+  let parameters = (ins
+    EnumParameter<LinearIdEnum>:$linear_id
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute to allow re-interpreting the linear mapping for threads in GPU
+    devices.
+
+    Threads (aka work items) are grouped into a thread block, where the block
+    may be described by a 1-, 2- or 3-dimensional rectangular basis.
+    The linear thread id is obtained by linearizing the 1-, 2- or 3-dimensional
+    index. For instance, if the basis is denoted as (BX, BY, BZ) and the thread
+    id is denoted by (tx, ty, tz), the linear thread id is:
+      `linear_id = tx + ty * BX + tz * BX * BY`.
+    The linear thread id is fixed for the duration of a GPU kernel.
+    
+    This linear id mapping attribute indicates a different linearization
+    relation is applied locally to a loop nest.
+    
+    For instance, if the new basis is denoted as (LBX, LBY, LBZ), the thread id
+    in the new basis is:
+      `(linear_id mod LBX, (linear_id / LBX) mod LBY, linear_id / (LBX * LBY))`.
+    This reinterpretation is only fixed for the duration of a loop nest.
+    
+    It can be consumed by lowering to generate GPU code.
+  }];
+}
+
 def BlocksEnum : I64EnumAttr<"Blocks", "threads for loop mapping", [
     DimX, DimY, DimZ]> {
   let cppNamespace = "::mlir::gpu";

diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
index 579922a3a9c03..57d74d856cba7 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -33,33 +33,94 @@ class DialectRegistry;
 namespace transform {
 namespace gpu {
 
+/// Helper type for functions that generate ids for the mapping of a
+/// scf.forall.
+struct IdBuilderResult {
+  // Ops used to replace the forall induction variables.
+  SmallVector<Value> mappingIdOps;
+  // Actual mapping sizes used to predicate the forall body when they are
+  // smaller than the available mapping sizes.
+  SmallVector<int64_t> predicateMappingSizes;
+  // Ops used to predicate the forall body when predicateMappingSizes is smaller
+  // than the available mapping sizes.
+  SmallVector<Value> predicateIdOps;
+};
+
+/// Common gpu id builder type that allows configuring the lowering for various
+/// mapping schemes. Takes:
+///   - A rewriter with insertion point set before the forall op to rewrite.
+///   - The loc of the forall op to rewrite.
+///   - A list of positive integers carrying the mapping sizes for the current
+///     forall op to rewrite.
+using GpuIdBuilderFnType =
+    std::function<IdBuilderResult(RewriterBase &, Location, ArrayRef<int64_t>)>;
+
+/// Helper struct for configuring the rewrite of mapped scf.forall ops to
+/// various gpu id configurations.
+struct GpuIdBuilder {
+  GpuIdBuilder(ArrayRef<OpFoldResult> blockDims, ArrayRef<int64_t> mappingSizes)
+      : blockDimsOfr(blockDims), availableMappingSizes(mappingSizes),
+        mappingAttributes(), idBuilder() {}
+
+  /// List of OpFoldResult carrying the multi-dimensional number of
+  /// threads available in the current kernel (i.e. the current blockDims in
+  /// CUDA parlance).
+  ArrayRef<OpFoldResult> blockDimsOfr;
+
+  /// A list of positive integers carrying the number of available mapping
+  /// resources that can trigger predication.
+  ArrayRef<int64_t> availableMappingSizes;
+
+  /// The mapping attributes targeted by this generator.
+  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
+
+  /// The constructor that builds the concrete IR for mapping ids.
+  GpuIdBuilderFnType idBuilder;
+};
+
 /// Map the top level `scf.forall` op to GPU Thread Blocks.
 /// Mapping is one-to-one and the induction variables of `scf.forall` are
-/// rewritten to gpu.block_id according to the thread_dim_apping attribute.
+/// rewritten to gpu.block_id according to the thread_dim_mapping attribute.
+///
 /// Dynamic, `scf.forall` trip counts are currently not supported.
 /// Dynamic block dim sizes are currently not supported.
-DiagnosedSilenceableFailure mapForallToBlocksImpl(
-    RewriterBase &rewriter, TransformOpInterface transformOp,
-    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        blockIdGenerator);
-
-/// Search `scf.forall` ops nested under `target` and map each such op to GPU
-/// threads. Mapping is one-to-one and the induction variables of `scf.forall`
-/// are rewritten to gpu.thread_id according to the thread_dim_mapping
-/// attribute.
-/// Sibling `scf.forall` are supported in which case, the union of the number of
-/// threads is computed and may result in predication.
+DiagnosedSilenceableFailure
+mapForallToBlocksImpl(RewriterBase &rewriter, TransformOpInterface transformOp,
+                      scf::ForallOp forallOp,
+                      SmallVectorImpl<int64_t> &gridDims,
+                      const GpuIdBuilder &gpuIdBuilder);
+
+/// Search `scf.forall` ops nested under `target` and map each such op to an
+/// explicit GPU implementation along `availableMappingSizes`.
+/// The mapping is one-to-one and the induction variables of `scf.forall` are
+/// rewritten to gpuIdBuilder.idBuilder according to the
+/// gpuIdBuilder.mappingAttributes attribute.
+///
 /// Dynamic, `scf.forall` trip counts are currently not supported.
-/// Dynamic block dim sizes are currently not supported.
+/// Dynamic `availableMappingSizes` sizes are currently not supported.
+/// `availableMappingSizes` is expected to be of size 3.
+DiagnosedSilenceableFailure mapOneForallToThreadsImpl(
+    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
+    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder);
+
+/// Search `scf.forall` ops nested under `target` and map each such op to an
+/// explicit GPU implementation along blockDims and warpDims.
+/// The mapping is one-to-one and the induction variables of `scf.forall` are
+/// rewritten to threads and warps ids according to the mapping attribute.
+///
+/// Dynamic, `scf.forall` trip counts are currently not supported.
+/// Dynamic `blockDims` or `warpDims` or `linearDims` sizes are currently not
+/// supported.
+/// `blockDims` is expected to be of size 3.
+/// `warpDims` is expected to be empty or of size 3.
+///
+/// The insertion point of the `rewriter` is expected to be set at the
+/// beginning of the `target` body block and dominate all other blocks.
 DiagnosedSilenceableFailure mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
-    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
-        threadIdGenerator);
+    Operation *target, ArrayRef<int64_t> blockDimsOfr,
+    ArrayRef<int64_t> warpDims, bool syncAfterDistribute);
 
 /// Find the unique top level scf::ForallOp within a given target op.
 DiagnosedSilenceableFailure

diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
index 46f0e186741e8..c719fedc90e33 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -22,21 +22,26 @@ def MapNestedForallToThreads :
      TransformEachOpTrait,
      TransformOpInterface]> {
   let description = [{
-      Target the `gpu.launch op` and rewrite all `scf.forall`
-      nested in it to distributed `gpu.thread_id` attribute.
-
-      The operation searches for `scf.forall` ops nested under `target`
-      and maps each such op to GPU threads. Mapping is one-to-one and the
-      induction variables of `scf.forall` are rewritten to
-      `gpu.thread_id` according to the `mapping` attribute.
-
-      Sibling `scf.forall` are supported in which case, the union of
-      the number of threads is computed and may result in predication.
-
-      Multiple scf.forall are supported per `gpu.launch` in which case,
-      the max of all the threads is computed and taken for the global
-      `gpu.thread_id`. If necessary, `scf.forall` that do not use the
-      whole thread range result in predicated computations.
+      Target the `gpu.launch` op and rewrite all `scf.forall` ops nested in it
+      to distributed `gpu.thread_id` attributes.
+
+      The operation searches for `scf.forall` ops nested under `target` and maps
+      each such op to GPU threads. 
+      
+      `scf.forall` induction variables are rewritten to `gpu.thread_id` according
+      to the `mapping` attribute.
+
+      Different types of mapping attributes are supported:
+        - the block_dims is a list of integers that specifies the number of
+          threads in each dimension. This is a mandatory attribute that is used
+          to constrain the number of threads in each dimension. If an 
+          `scf.forall` op is mapped to fewer threads, predication occurs.
+        - the warp_dims is a list of integers that specifies the number of
+          warps in each dimension. This is an optional attribute that is used
+          to constrain the number of warps in each dimension. When present, this
+          attribute must be specified in a way that is compatible with the 
+          block_dims attribute. If an `scf.forall` op is mapped to fewer warps,
+          predication occurs.
 
       Dynamic `scf.forall` trip counts are currently not supported.
       Dynamic block dim sizes are currently not supported.
@@ -45,10 +50,12 @@ def MapNestedForallToThreads :
       Only `scf.forall` distributed to **at most 3 dimensions** are
       currently supported.
 
-      Barriers are inserted after each scf.forall op for now.
+      The `sync_after_distribute` attribute controls whether a `gpu.barrier` is
+      inserted after each scf.forall op. At this time, this is an all-or-nothing
+      choice. This will need to be tightened in the future.
 
-      The operation alters the block size of the given gpu_launch using
-      blockDim argument.
+      The operation alters the block size of the given gpu_launch using the 
+      mandatory block_dims argument.
 
       #### Return modes:
 
@@ -83,6 +90,7 @@ def MapNestedForallToThreads :
         gpu.terminator
       }
       ```
+
       is translated to:
 
       ```
@@ -104,11 +112,18 @@ def MapNestedForallToThreads :
     }];
 
   let arguments = (ins PDL_Operation:$target,
-                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
-                   DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
+                   DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$block_dims,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
+                   DefaultValuedAttr<BoolAttr, "true">:$sync_after_distribute);
   let results = (outs PDL_Operation:$result);
 
-  let assemblyFormat = "$target attr-dict";
+  let assemblyFormat = [{
+    $target
+    `block_dims` `=` $block_dims
+    (`warp_dims` `=` $warp_dims^)?
+    (`sync_after_distribute` `=` $sync_after_distribute^)?
+    attr-dict
+  }];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::Operation *target,
@@ -117,7 +132,6 @@ def MapNestedForallToThreads :
   }];
 }
 
-
 def MapForallToBlocks :
   Op<Transform_Dialect, "gpu.map_forall_to_blocks",
     [FunctionalStyleTransformOpTrait,
@@ -142,8 +156,8 @@ def MapForallToBlocks :
     Only scf.forall distributed to **at most 3 dimensions** are
     currently supported.
 
-    The operation alters the block size of the given gpu_launch using
-    gridDim argument.
+    The operation alters the grid size of the given gpu_launch using the
+    grid_dims argument.
 
     #### Return modes:
 
@@ -162,11 +176,16 @@ def MapForallToBlocks :
   }];
 
   let arguments = (ins PDL_Operation:$target,
-                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$grid_dims,
                    UnitAttr:$generate_gpu_launch);
   let results = (outs PDL_Operation:$result);
 
-  let assemblyFormat = "$target attr-dict";
+  let assemblyFormat = [{
+    $target
+    (`generate_gpu_launch` $generate_gpu_launch^)?
+    (`grid_dims` `=` $grid_dims^)?
+    attr-dict
+  }];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::Operation *target,

diff  --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 0ec5877f80361..f9d929d163445 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -46,6 +46,10 @@ int64_t GPUWarpMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getWarp());
 }
 
+int64_t GPULinearIdMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLinearId());
+}
+
 int64_t GPUThreadMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getThread());
 }

diff  --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 748d9e46ac153..f1559970d36d9 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -8,7 +8,9 @@
 
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -16,9 +18,14 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Visitors.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -27,6 +34,7 @@
 using namespace mlir;
 using namespace mlir::gpu;
 using namespace mlir::transform;
+using namespace mlir::transform::gpu;
 
 #define DEBUG_TYPE "gpu-transforms"
 
@@ -35,68 +43,200 @@ using namespace mlir::transform;
 
 namespace {
 
-/// Helper type for functions that generate ids for the mapping of a scf.forall.
-using IdGeneratorFnType = llvm::function_ref<void(RewriterBase &, scf::ForallOp,
-                                                  SmallVectorImpl<Value> &)>;
+/// Return a flattened thread id for the workgroup with given sizes.
+static Value buildLinearThreadId(RewriterBase &rewriter, Location loc,
+                                 ArrayRef<OpFoldResult> blockDimsOfr) {
+  LLVM_DEBUG(llvm::interleaveComma(
+                 blockDimsOfr,
+                 DBGS() << "----buildLinearThreadId with blockDimsOfr:  ");
+             llvm::dbgs() << "\n");
+  assert(blockDimsOfr.size() == 3 && "expected 3 workgroup sizes");
+  AffineExpr tx, ty, tz, BDX, BDY;
+  bindDims(rewriter.getContext(), tx, ty, tz);
+  bindSymbols(rewriter.getContext(), BDX, BDY);
+  IndexType indexType = rewriter.getIndexType();
+  SmallVector<OpFoldResult> threadsAndWorkGroups{
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y).getResult(),
+      rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z).getResult()};
+  threadsAndWorkGroups.push_back(blockDimsOfr[0]);
+  threadsAndWorkGroups.push_back(blockDimsOfr[1]);
+  OpFoldResult ofr = makeComposedFoldedAffineApply(
+      rewriter, loc, tx + ty * BDX + tz * BDX * BDY, threadsAndWorkGroups);
+  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
+}
 
-struct MappingToGpuHelper {
-  MappingToGpuHelper(SmallVector<DeviceMappingAttrInterface> mappingAttributes,
-                     IdGeneratorFnType idGenerator)
-      : mappingAttributes(mappingAttributes), idGenerator(idGenerator) {}
+/// Builder for gpu::BlockIdOps used in mapping scf.forall to blocks.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuBlockIdBuilder : public GpuIdBuilder {
+
+  GpuBlockIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                    ArrayRef<int64_t> mappingSizes)
+      : GpuIdBuilder(blockDims, mappingSizes) {
+    mappingAttributes = {GPUBlockMappingAttr::get(ctx, Blocks::DimX),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimY),
+                         GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
+    idBuilder = [](RewriterBase &rewriter, Location loc,
+                   ArrayRef<int64_t> forallMappingSizes) {
+      IndexType indexType = rewriter.getIndexType();
+      SmallVector<Value> ids{
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)};
+      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+      // predicate generation.
+      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
+                             ids};
+    };
+  }
+};
 
-  SmallVector<DeviceMappingAttrInterface> mappingAttributes;
-  IdGeneratorFnType idGenerator;
+/// Builder for gpu::ThreadIdOp used in mapping scf.forall to thread ids without
+/// any reindexing.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuThreadIdBuilder : public GpuIdBuilder {
+  GpuThreadIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                     ArrayRef<int64_t> mappingSizes)
+      : GpuIdBuilder(blockDims, mappingSizes) {
+    mappingAttributes = {GPUThreadMappingAttr::get(ctx, Threads::DimX),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimY),
+                         GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
+    idBuilder = [](RewriterBase &rewriter, Location loc,
+                   ArrayRef<int64_t> forallMappingSizes) {
+      IndexType indexType = rewriter.getIndexType();
+      SmallVector<Value> ids{
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
+          rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+      // predicate generation.
+      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
+                             ids};
+    };
+  }
 };
 
-struct MappingToGpuBlocksHelper : public MappingToGpuHelper {
-
-  MappingToGpuBlocksHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUBlockMappingAttr::get(ctx, Blocks::DimX),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimY),
-                GPUBlockMappingAttr::get(ctx, Blocks::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<BlockIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<BlockIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+/// Builder for warp ids used in mapping scf.forall to warps.
+/// This builder requires a specification of the number of warps along each
+/// dimension to control the mapping to warps, as well as predication, more
+/// finely than would be possible by solely analyzing the IR.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 3-D sizes for predicate generation.
+struct GpuWarpIdBuilder : public GpuIdBuilder {
+  GpuWarpIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                   ArrayRef<int64_t> mappingSizes)
+      : GpuIdBuilder(blockDims, mappingSizes) {
+    mappingAttributes = {GPUWarpMappingAttr::get(ctx, Warps::DimX),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimY),
+                         GPUWarpMappingAttr::get(ctx, Warps::DimZ)};
+    idBuilder = [this](RewriterBase &rewriter, Location loc,
+                       ArrayRef<int64_t> forallMappingSizes) {
+      // Build the linear warp id and decompose it in the basis of
+      // `forallMappingSizes`.
+      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+      AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+      OpFoldResult warpIdOfr = makeComposedFoldedAffineApply(
+          rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
+      Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
+      SmallVector<int64_t> reverseBasisSizes(
+          llvm::reverse(this->availableMappingSizes));
+      SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
+
+      // clang-format off
+      LDBG("----linearId: " << linearId);
+          LDBG("----warpId: " << warpId);
+      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n";);
+      // clang-format on
+
+      // Return 3-D ids for indexing rewrite and 3-D sizes and ids for
+      // predicate generation.
+      return IdBuilderResult{ids, SmallVector<int64_t>{forallMappingSizes},
+                             ids};
+    };
+  }
+
+  /// Static specification of the warp size.
+  /// In the future this may be configured by the transformation.
+  static constexpr int64_t kWarpSize = 32;
 };
 
-struct MappingToGpuThreadsHelper : public MappingToGpuHelper {
-  MappingToGpuThreadsHelper(MLIRContext *ctx)
-      : MappingToGpuHelper(
-            SmallVector<DeviceMappingAttrInterface>{
-                GPUThreadMappingAttr::get(ctx, Threads::DimX),
-                GPUThreadMappingAttr::get(ctx, Threads::DimY),
-                GPUThreadMappingAttr::get(ctx, Threads::DimZ)},
-            IdGeneratorFnType{[](RewriterBase &rewriter, scf::ForallOp forallOp,
-                                 SmallVectorImpl<Value> &ids) {
-              OpBuilder::InsertionGuard guard(rewriter);
-              rewriter.setInsertionPoint(forallOp);
-              IndexType indexType = rewriter.getIndexType();
-              auto loc = forallOp->getLoc();
-              ids.assign(
-                  {rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
-                   rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)});
-            }}) {}
+/// Builder for linear ids used in mapping scf.forall to reindexed threads.
+/// The `idBuilder` method returns 3-D values used for indexing rewrites as well
+/// as 1-D sizes for predicate generation.
+struct GpuLinearIdBuilder : public GpuIdBuilder {
+  GpuLinearIdBuilder(MLIRContext *ctx, ArrayRef<OpFoldResult> blockDims,
+                     ArrayRef<int64_t> mappingSizes)
+      : GpuIdBuilder(blockDims, mappingSizes) {
+    mappingAttributes = {GPULinearIdMappingAttr::get(ctx, LinearId::DimX),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimY),
+                         GPULinearIdMappingAttr::get(ctx, LinearId::DimZ)};
+    idBuilder = [this](RewriterBase &rewriter, Location loc,
+                       ArrayRef<int64_t> forallMappingSizes) {
+      // Build the linear thread id and decompose it in the basis of
+      // `forallMappingSizes`.
+      Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+      SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+      SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+      AffineExpr d0;
+      bindDims(rewriter.getContext(), d0);
+      SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+      SmallVector<Value> ids;
+      for (AffineExpr e : delinearizingExprs)
+        ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearId));
+
+      // clang-format off
+      LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                       DBGS() << "--delinearization basis: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(strides,
+                                       DBGS() << "--delinearization strides: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(delinearizingExprs,
+                                       DBGS() << "--delinearization exprs: ");
+                 llvm::dbgs() << "\n";
+                 llvm::interleaveComma(ids, DBGS() << "--ids: ");
+                 llvm::dbgs() << "\n";);
+      // clang-format on
+
+      // Compute and return the 1-D actual mapping size spanned by the linearId,
+      // it will be used to predicate against the linearized total number of
+      // threads.
+      int64_t actualMappingSize = 1;
+      for (int64_t s : forallMappingSizes)
+        actualMappingSize *= s;
+
+      // Return 3-D ids for indexing rewrite and 1-D size and id for
+      // predicate generation.
+      return IdBuilderResult{ids, SmallVector<int64_t>{actualMappingSize},
+                             SmallVector<Value>{linearId}};
+    };
+  }
 };
 
 } // namespace
 
 static DiagnosedSilenceableFailure
-failureHelper(std::optional<TransformOpInterface> transformOp,
-              scf::ForallOp forallOp, const Twine &message) {
+definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
+                      Operation *target, const Twine &message) {
   if (transformOp.has_value())
-    return emitDefiniteFailure(*transformOp, message);
-  return emitDefiniteFailure(forallOp, message);
+    return transformOp->emitDefiniteFailure() << message;
+  return emitDefiniteFailure(target, message);
 }
 
 /// Check if given mapping attributes are one of the desired attributes
@@ -104,7 +244,8 @@ static DiagnosedSilenceableFailure
 checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                            scf::ForallOp forallOp) {
   if (!forallOp.getMapping().has_value())
-    return failureHelper(transformOp, forallOp, "mapping must be present");
+    return definiteFailureHelper(transformOp, forallOp,
+                                 "mapping must be present");
 
   bool hasBlockMapping =
       llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
@@ -114,20 +255,32 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
       llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
         return attr.isa<GPUThreadMappingAttr>();
       });
+  bool hasWarpMapping =
+      llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
+        return attr.isa<GPUWarpMappingAttr>();
+      });
+  bool hasLinearMapping =
+      llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
+        return attr.isa<GPULinearIdMappingAttr>();
+      });
   int64_t countMappingTypes = 0;
   countMappingTypes += hasBlockMapping ? 1 : 0;
   countMappingTypes += hasThreadMapping ? 1 : 0;
+  countMappingTypes += hasWarpMapping ? 1 : 0;
+  countMappingTypes += hasLinearMapping ? 1 : 0;
   if (countMappingTypes > 1) {
-    return failureHelper(transformOp, forallOp,
-                         "cannot mix different mapping types, use nesting");
+    return definiteFailureHelper(
+        transformOp, forallOp,
+        "cannot mix different mapping types, use nesting");
   }
 
   DenseSet<Attribute> seen;
   for (Attribute map : forallOp.getMapping()->getValue()) {
     if (seen.contains(map)) {
-      return failureHelper(transformOp, forallOp,
-                           "duplicated attribute, cannot map different loops "
-                           "to the same processor");
+      return definiteFailureHelper(
+          transformOp, forallOp,
+          "duplicated attribute, cannot map different loops "
+          "to the same processor");
     }
     seen.insert(map);
   }
@@ -146,26 +299,26 @@ verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
 
   // Perform other non-types verifications.
   if (!forallOp.isNormalized())
-    return failureHelper(transformOp, forallOp,
-                         "unsupported non-normalized loops");
+    return definiteFailureHelper(transformOp, forallOp,
+                                 "unsupported non-normalized loops");
   if (forallOp.getNumResults() > 0)
-    return failureHelper(transformOp, forallOp,
-                         "only bufferized scf.forall can be mapped");
+    return definiteFailureHelper(transformOp, forallOp,
+                                 "only bufferized scf.forall can be mapped");
   if (forallOp.getRank() > 3)
-    return failureHelper(transformOp, forallOp,
-                         "scf.forall with rank > 3 does not lower");
+    return definiteFailureHelper(transformOp, forallOp,
+                                 "scf.forall with rank > 3 does not lower");
   if (llvm::any_of(forallOp.getMixedUpperBound(), [&](OpFoldResult ofr) {
         return !getConstantIntValue(ofr).has_value();
       })) {
-    return failureHelper(transformOp, forallOp,
-                         "unsupported dynamic sizes in forall op");
+    return definiteFailureHelper(transformOp, forallOp,
+                                 "unsupported dynamic sizes in forall op");
   }
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Determines if the size of the kernel configuration is supported by the GPU
-/// architecture being used. It presently makes use of CUDA limitations, however
-/// that aspect may be enhanced for other GPUs.
+/// Determines if the size of the kernel configuration is supported by the
+/// GPU architecture being used. It presently makes use of CUDA limitations,
+/// however that aspect may be enhanced for other GPUs.
 static DiagnosedSilenceableFailure checkGpuLimits(
     TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
     std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
@@ -192,17 +345,17 @@ static DiagnosedSilenceableFailure checkGpuLimits(
       gridDimZ.value_or(1) > maxGriddimz ||
       gridDimX.value_or(1) > maxGriddimx) {
     return transformOp.emitSilenceableError()
-           << "Trying to launch a GPU kernel with gridDim = ("
+           << "Trying to launch a GPU kernel with grid_dims = ("
            << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
-           << gridDimZ.value_or(1) << ") blockDim = (" << blockDimX.value_or(1)
-           << ", " << blockDimY.value_or(1) << ", " << blockDimZ.value_or(1)
-           << "). It is larger than the limits.";
+           << gridDimZ.value_or(1) << ") block_dims = ("
+           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
+           << blockDimZ.value_or(1) << "). It is larger than the limits.";
   }
   return DiagnosedSilenceableFailure::success();
 }
 
-/// Creates an empty-body gpu::LaunchOp using the provided kernel settings and
-/// put a terminator within.
+/// Creates an empty-body gpu::LaunchOp using the provided kernel settings
+/// and put a terminator within.
 static DiagnosedSilenceableFailure
 createGpuLaunch(RewriterBase &rewriter, Location loc,
                 TransformOpInterface transformOp, LaunchOp &launchOp,
@@ -278,24 +431,36 @@ alterGpuLaunch(IRRewriter &rewriter, LaunchOp gpuLaunch,
   return DiagnosedSilenceableFailure::success();
 }
 
-//===----------------------------------------------------------------------===//
-// MapForallToBlocks
-//===----------------------------------------------------------------------===//
+/// Struct to return the result of the rewrite of a forall operation.
+struct ForallRewriteResult {
+  SmallVector<int64_t> mappingSizes;
+  SmallVector<Value> mappingIds;
+};
 
-static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(
+/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
+template <typename OpTy, typename OperationOrBlock>
+static void
+replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
+                            OperationOrBlock *parent, Value replacement,
+                            ArrayRef<int64_t> availableMappingSizes) {
+  parent->walk([&](OpTy idOp) {
+    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
+      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
+  });
+}
+
+static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    scf::ForallOp forallOp,
-    const SmallVectorImpl<int64_t> &availableMappingSizes,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
-  LDBG("Start rewriteOneForallCommonImpl");
+    scf::ForallOp forallOp, ForallRewriteResult &result,
+    ArrayRef<int64_t> availableMappingSizes, const GpuIdBuilder &gpuIdBuilder) {
+  LDBG("--start rewriteOneForallCommonImpl");
 
   // Step 0. GPU-specific verifications. There is no better place to anchor
-  // those right now: the ForallOp is target-independent and the transform op
-  // does not apply to individual ForallOp.
+  // those right now: the ForallOp is target-independent and the transform
+  // op does not apply to individual ForallOp.
   DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
   if (!diag.succeeded())
-    return failure();
+    return diag;
 
   // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
   SmallVector<int64_t> tmpMappingSizes = llvm::to_vector(
@@ -304,97 +469,108 @@ static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(
         assert(maybeStaticValue && "expected static value");
         return maybeStaticValue.value();
       }));
-  SmallVector<Attribute> forallMappings =
+  SmallVector<Attribute> forallMappingAttrs =
       llvm::to_vector(forallOp.getMapping()->getValue());
-  for (auto attr : allMappingAttributes) {
-    if (llvm::is_contained(forallMappings, attr))
+  for (auto attr : gpuIdBuilder.mappingAttributes) {
+    if (llvm::is_contained(forallMappingAttrs, attr))
       continue;
-    forallMappings.push_back(attr);
+    forallMappingAttrs.push_back(attr);
     tmpMappingSizes.push_back(1);
   }
+  LLVM_DEBUG(
+      llvm::interleaveComma(
+          tmpMappingSizes,
+          DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");
+      llvm::dbgs() << "\n");
 
   // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
   auto comparator = [&](DeviceMappingAttrInterface a,
                         DeviceMappingAttrInterface b) -> bool {
     return a.getMappingId() < b.getMappingId();
   };
-  SmallVector<int64_t> mappingSizes =
-      getValuesSortedByKey(forallMappings, tmpMappingSizes, comparator);
-  LLVM_DEBUG(llvm::interleaveComma(mappingSizes, DBGS() << "mappingSizes: ");
-             llvm::dbgs() << "\n";
-             llvm::interleaveComma(forallMappings, DBGS() << "mappingAttrs: ");
+  SmallVector<int64_t> forallMappingSizes =
+      getValuesSortedByKey(forallMappingAttrs, tmpMappingSizes, comparator);
+  LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
+                                   DBGS() << "----forallMappingSizes: ");
+             llvm::dbgs() << "\n"; llvm::interleaveComma(
+                 forallMappingAttrs, DBGS() << "----mappingAttrs: ");
              llvm::dbgs() << "\n");
 
-  // Step 3. Generate the mappingIdOps using the provided generator and map the
-  // induction variables to the newly created ops. Replace ids of dimension
-  // known to be of size 1 by zero to simplify the IR.
-  SmallVector<Value> mappingIdOps;
+  // Step 3. Generate the mappingIdOps using the provided generator.
   Location loc = forallOp.getLoc();
-  idGenerator(rewriter, forallOp, mappingIdOps);
-  LLVM_DEBUG(llvm::interleaveComma(mappingIdOps, DBGS() << "mappingIdOps: ");
-             llvm::dbgs() << "\n");
-  assert(mappingIdOps.size() == mappingSizes.size() && "expect equal sizes");
-  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  if (!availableMappingSizes.empty()) {
-    for (size_t i : llvm::seq(size_t(0), availableMappingSizes.size())) {
-      if (availableMappingSizes[i] == 1)
-        mappingIdOps[i] = zero;
-    }
-  }
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPoint(forallOp);
+  IdBuilderResult builderResult =
+      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes);
 
+  // Step 4. Map the induction variables to the mappingIdOps, this may involve a
+  // permutation.
+  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
   IRMapping bvm;
   for (auto [iv, dim] :
        llvm::zip_equal(forallOp.getInductionVars(),
-                       ArrayRef<Attribute>{forallMappings}.take_front(
+                       ArrayRef<Attribute>{forallMappingAttrs}.take_front(
                            forallOp.getInductionVars().size()))) {
     Value peIdOp = mappingIdOps[static_cast<int64_t>(
         dim.cast<DeviceMappingAttrInterface>().getMappingId())];
     bvm.map(iv, peIdOp);
   }
 
-  // Step 4. Maybe create conditionals to predicate the region.
-  // Skip this step when availableMappingSizes is empty.
+  // Step 5. If the availableMappingSizes are already known, create conditionals
+  // to predicate the region. Otherwise, the current forall determines the
+  // availableMappingSizes and no predication occurs.
   Value predicate;
   if (!availableMappingSizes.empty()) {
-    LLVM_DEBUG(llvm::interleaveComma(availableMappingSizes,
-                                     DBGS() << "availableMappingSizes: ");
-               llvm::dbgs() << "\n");
-    for (auto [id, mappingSize, availableMappingSize] :
-         llvm::zip_equal(mappingIdOps, mappingSizes, availableMappingSizes)) {
+    SmallVector<int64_t> predicateMappingSizes =
+        builderResult.predicateMappingSizes;
+    SmallVector<Value> predicateIdOps = builderResult.predicateIdOps;
+    // clang-format off
+    LLVM_DEBUG(
+        llvm::interleaveComma(
+          predicateMappingSizes, DBGS() << "----predicateMappingSizes: ");
+        llvm::dbgs() << "\n"; 
+        llvm::interleaveComma(
+          availableMappingSizes, DBGS() << "----availableMappingSizes: ");
+        llvm::dbgs() << "\n";
+        llvm::interleaveComma(predicateIdOps, DBGS() << "----predicateIdOps: ");
+        llvm::dbgs() << "\n");
+    // clang-format on
+    for (auto [id, mappingSize, availableMappingSize] : llvm::zip_equal(
+             predicateIdOps, predicateMappingSizes, availableMappingSizes)) {
       if (mappingSize > availableMappingSize) {
-        (void)failureHelper(
+        return definiteFailureHelper(
             transformOp, forallOp,
             "Trying to map to fewer GPU threads than loop iterations but "
             "overprovisioning is not yet supported. "
             "Try additional tiling of the before mapping or map to more "
             "threads.");
-        return failure();
       }
       if (mappingSize == availableMappingSize)
         continue;
       Value idx = rewriter.create<arith::ConstantIndexOp>(loc, mappingSize);
       Value tmpPredicate = rewriter.create<arith::CmpIOp>(
           loc, arith::CmpIPredicate::ult, id, idx);
-      LDBG("predicate: " << tmpPredicate);
+      LDBG("----predicate: " << tmpPredicate);
       predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
                                                              tmpPredicate)
                             : tmpPredicate;
     }
   }
 
-  // Step 5. Move the body of forallOp.
+  // Step 6. Move the body of forallOp.
   // Erase the terminator first, it will not be used.
   rewriter.eraseOp(forallOp.getTerminator());
   Block *targetBlock;
   Block::iterator insertionPoint;
   if (predicate) {
-    // Step 5.a. If predicated, move at the beginning.
-    auto ifOp =
-        rewriter.create<scf::IfOp>(loc, predicate, /*withElseRegion=*/false);
+    // Step 6.a. If predicated, move at the beginning.
+    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
+                                           /*withElseRegion=*/false);
     targetBlock = ifOp.thenBlock();
     insertionPoint = ifOp.thenBlock()->begin();
   } else {
-    // Step 5.b. Otherwise, move inline just at the rewriter insertion point.
+    // Step 6.b. Otherwise, move inline just at the rewriter insertion
+    // point.
     targetBlock = forallOp->getBlock();
     insertionPoint = rewriter.getInsertionPoint();
   }
@@ -402,32 +578,59 @@ static FailureOr<SmallVector<int64_t>> rewriteOneForallCommonImpl(
   targetBlock->getOperations().splice(insertionPoint,
                                       sourceBlock.getOperations());
 
-  // Step 6. RAUW thread indices to thread ops.
+  // Step 7. RAUW indices.
   for (Value loopIndex : forallOp.getInductionVars()) {
     Value threadIdx = bvm.lookup(loopIndex);
     rewriter.replaceAllUsesWith(loopIndex, threadIdx);
   }
 
-  // Step 7. Erase old op.
+  // Step 8. Erase old op.
   rewriter.eraseOp(forallOp);
 
-  return mappingSizes;
+  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
+  return DiagnosedSilenceableFailure::success();
 }
 
+//===----------------------------------------------------------------------===//
+// MapForallToBlocks
+//===----------------------------------------------------------------------===//
+
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
     RewriterBase &rewriter, TransformOpInterface transformOp,
     scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
-  // Pass an empty anyAvailableMappingSizes.
+    const GpuIdBuilder &gpuIdBuilder) {
+  LDBG("Start mapForallToBlocksImpl");
+
+  Location loc = forallOp.getLoc();
+  Block *parentBlock = forallOp->getBlock();
+  Value zero;
+  {
+    // Create an early zero index value for replacements and immediately reset
+    // the insertion point.
+    OpBuilder::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointToStart(parentBlock);
+    zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  }
+
   SmallVector<int64_t> anyAvailableMappingSizes;
-  FailureOr<SmallVector<int64_t>> maybeMappingSizes =
-      rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                 anyAvailableMappingSizes, allMappingAttributes,
-                                 idGenerator);
-  if (failed(maybeMappingSizes))
-    return DiagnosedSilenceableFailure::definiteFailure();
-  gridDims = *maybeMappingSizes;
+  ForallRewriteResult rewriteResult;
+  // Pass an empty anyAvailableMappingSizes.
+  DiagnosedSilenceableFailure diag =
+      rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,
+                                 anyAvailableMappingSizes, gpuIdBuilder);
+
+  // Return if anything goes wrong, use silenceable failure as a match failure.
+  if (!diag.succeeded())
+    return diag;
+
+  // Set the gridDims that act as a return.
+  gridDims = rewriteResult.mappingSizes;
+
+  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+  // Here, the result of mapping determines the available mapping sizes.
+  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
+                                          gridDims);
+
   return DiagnosedSilenceableFailure::success();
 }
 
@@ -476,7 +679,7 @@ transform::MapForallToBlocks::applyToOne(Operation *target,
     return diag;
   }
 
-  SmallVector<int64_t> gridDims = extractFromI64ArrayAttr(getGridDim());
+  SmallVector<int64_t> gridDims{getGridDims()};
   if (!getGenerateGpuLaunch() && gridDims.size() != 3)
     return transformOp.emitDefiniteFailure("transform require size-3 mapping");
 
@@ -496,17 +699,14 @@ transform::MapForallToBlocks::applyToOne(Operation *target,
     topLevelForallOp = cast<scf::ForallOp>(newForallOp);
   }
 
-  diag = verifyGpuMapping(transformOp, topLevelForallOp);
-  if (!diag.succeeded())
-    return diag;
-
-  MappingToGpuBlocksHelper helper(getContext());
+  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), {}, {});
   diag = mlir::transform::gpu::mapForallToBlocksImpl(
-      rewriter, transformOp, topLevelForallOp, gridDims,
-      helper.mappingAttributes, helper.idGenerator);
+      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
   if (!diag.succeeded())
     return diag;
 
+  // Set the GPU launch configuration for the grid dims late, this is subject to
+  // IR inspection.
   diag = alterGpuLaunch(rewriter, gpuLaunch,
                         cast<TransformOpInterface>(getOperation()), gridDims[0],
                         gridDims[1], gridDims[2]);
@@ -519,37 +719,133 @@ transform::MapForallToBlocks::applyToOne(Operation *target,
 // MapNestedForallToThreads
 //===----------------------------------------------------------------------===//
 
+DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
+    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
+    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
+    bool syncAfterDistribute, const GpuIdBuilder &gpuIdBuilder) {
+  // Ignore cases with different attributes than this builder supports.
+  for (Attribute map : forallOp.getMapping()->getValue()) {
+    if (!llvm::is_contained(gpuIdBuilder.mappingAttributes, map)) {
+      LDBG("--skip " << map);
+      LLVM_DEBUG(llvm::interleaveComma(gpuIdBuilder.mappingAttributes,
+                                       DBGS() << "----not in: ");
+                 llvm::dbgs() << "\n";);
+      return emitSilenceableFailure(forallOp);
+    }
+  }
+
+  Location loc = forallOp.getLoc();
+  OpBuilder::InsertionGuard g(rewriter);
+  // Insert after to allow for syncthreads after `forall` is erased.
+  rewriter.setInsertionPointAfter(forallOp);
+  ForallRewriteResult rewriteResult;
+  DiagnosedSilenceableFailure diag =
+      rewriteOneForallCommonImpl(rewriter, transformOp, forallOp, rewriteResult,
+                                 availableMappingSizes, gpuIdBuilder);
+
+  // Return if anything goes wrong, use silenceable failure as a match failure.
+  if (!diag.succeeded())
+    return diag;
+
+  // Add a syncthreads if needed. TODO: warpsync
+  if (syncAfterDistribute)
+    rewriter.create<BarrierOp>(loc);
+
+  return DiagnosedSilenceableFailure::success();
+}
+
 DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
     RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
-    Operation *target, const SmallVectorImpl<int64_t> &kernelBlockDims,
-    bool syncAfterDistribute,
-    const ArrayRef<DeviceMappingAttrInterface> &allMappingAttributes,
-    IdGeneratorFnType idGenerator) {
+    Operation *target, ArrayRef<int64_t> blockDims, ArrayRef<int64_t> warpDims,
+    bool syncAfterDistribute) {
+  LDBG("Start mapNestedForallToThreadsImpl");
+  MLIRContext *ctx = rewriter.getContext();
+  SmallVector<OpFoldResult> blockDimsOfr =
+      getAsIndexOpFoldResult(ctx, blockDims);
+
+  if (blockDims.size() != 3)
+    return definiteFailureHelper(transformOp, target,
+                                 "requires size-3 thread mapping");
+  if (!warpDims.empty()) {
+    if (warpDims.size() != 3)
+      return definiteFailureHelper(transformOp, target,
+                                   "requires empty or size-3 warp mapping");
+  }
+
+  // Create an early zero index value for replacements.
+  Location loc = target->getLoc();
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
-  target->walk([&](scf::ForallOp forallOp) {
-    // Ignore cases with different attributes.
-    for (Attribute map : forallOp.getMapping()->getValue()) {
-      if (!llvm::is_contained(allMappingAttributes, map)) {
+  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
+    //===--------------------------------------------------------------------===//
+    // Mapping to warp ids.
+    //===--------------------------------------------------------------------===//
+    if (!warpDims.empty()) {
+      LLVM_DEBUG(
+          llvm::interleaveComma(
+              warpDims, DBGS() << "+mapNestedForallToThreadsImpl warpDims: ");
+          llvm::dbgs() << "\n");
+      LLVM_DEBUG(llvm::interleaveComma(
+                     blockDimsOfr, DBGS() << "--warpDims with blockDimsOfr:  ");
+                 llvm::dbgs() << "\n");
+      GpuWarpIdBuilder gpuWarpIdBuilder(ctx, blockDimsOfr, warpDims);
+      diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
+          rewriter, transformOp, forallOp, warpDims, syncAfterDistribute,
+          gpuWarpIdBuilder);
+      // Use silenceable failure to encode "failure to match" and pass
+      // through.
+      if (diag.isDefiniteFailure())
+        return WalkResult::interrupt();
+      if (diag.succeeded())
         return WalkResult::skip();
-      }
-    }
-    diag = verifyGpuMapping(transformOp, forallOp);
-    if (diag.succeeded()) {
-      // Take the loc ahead of time
-      Location loc = forallOp.getLoc();
-      OpBuilder::InsertionGuard g(rewriter);
-      rewriter.setInsertionPointAfter(forallOp);
-      if (failed(rewriteOneForallCommonImpl(rewriter, transformOp, forallOp,
-                                            kernelBlockDims,
-                                            allMappingAttributes, idGenerator)))
-        diag = DiagnosedSilenceableFailure::definiteFailure();
-      // Add a syncthreads if needed. TODO: warpsync
-      if (syncAfterDistribute)
-        rewriter.create<BarrierOp>(loc);
     }
-    return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
+
+    //===--------------------------------------------------------------------===//
+    // Mapping to linear ids.
+    //===--------------------------------------------------------------------===//
+    LDBG("+mapNestedForallToThreadsImpl linearDims");
+    LLVM_DEBUG(llvm::interleaveComma(
+                   blockDimsOfr, DBGS() << "--linearDims with blockDimsOfr:  ");
+               llvm::dbgs() << "\n");
+    int64_t numThreads = 1;
+    for (int64_t b : blockDims)
+      numThreads *= b;
+    GpuLinearIdBuilder gpuLinearIdBuilder(ctx, blockDimsOfr, numThreads);
+    diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
+        rewriter, transformOp, forallOp, numThreads, syncAfterDistribute,
+        gpuLinearIdBuilder);
+    // Use silenceable failure to encode "failure to match" and pass through.
+    if (diag.isDefiniteFailure())
+      return WalkResult::interrupt();
+    if (diag.succeeded())
+      return WalkResult::skip();
+
+    //===--------------------------------------------------------------------===//
+    // Mapping to block ids (happens last so we can replay ThreadIdOp).
+    //===--------------------------------------------------------------------===//
+    LLVM_DEBUG(
+        llvm::interleaveComma(
+            blockDimsOfr, DBGS() << "mapNestedForallToThreadsImpl blockDims: ");
+        llvm::dbgs() << "\n");
+    GpuThreadIdBuilder gpuThreadIdBuilder(ctx, blockDimsOfr, blockDims);
+    diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
+        rewriter, transformOp, forallOp, blockDims, syncAfterDistribute,
+        gpuThreadIdBuilder);
+    // Use silenceable failure to encode "failure to match" and pass through.
+    if (diag.isDefiniteFailure())
+      return WalkResult::interrupt();
+
+    return WalkResult::advance();
   });
-  return diag;
+  if (walkResult.wasInterrupted())
+    return diag;
+
+  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
+  // Here, the result of mapping determines the available mapping sizes.
+  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
+                                          blockDims);
+
+  return DiagnosedSilenceableFailure::success();
 }
 
 DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
@@ -561,32 +857,29 @@ DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
   if (!gpuLaunch)
     return emitSilenceableError() << "Given target is not a gpu.launch";
 
-  SmallVector<int64_t> blockDims = extractFromI64ArrayAttr(getBlockDim());
-  if (blockDims.size() != 3)
-    return transformOp.emitDefiniteFailure("transform require size-3 mapping");
+  // Mapping to block ids.
+  SmallVector<int64_t> blockDims{getBlockDims()};
 
   DiagnosedSilenceableFailure diag =
       checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                      blockDims[0], blockDims[1], blockDims[2]);
   if (diag.isSilenceableFailure()) {
-    diag.attachNote(getLoc()) << getBlockDimAttrName() << " is too large";
+    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
     return diag;
   }
 
-  MLIRContext *ctx = getContext();
-  IRRewriter rewriter(ctx);
-  MappingToGpuThreadsHelper helper(ctx);
-  diag = mlir::transform::gpu::mapNestedForallToThreadsImpl(
-      rewriter, transformOp, target, blockDims, getSyncAfterDistribute(),
-      helper.mappingAttributes, helper.idGenerator);
-
-  if (!diag.succeeded())
-    return diag;
-
+  // Set the GPU launch configuration for the block dims early; this is not
+  // subject to IR inspection.
+  IRRewriter rewriter(getContext());
   diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                         std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                         blockDims[2]);
 
+  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
+  diag =
+      mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
+                                   getWarpDims(), getSyncAfterDistribute());
+
   results.push_back(gpuLaunch.getOperation());
   return diag;
 }

diff  --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
index 50f49727d3e68..459b800f76d35 100644
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -8,7 +8,7 @@ transform.sequence failures(propagate) {
 ^bb0(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["tensor.empty"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   // expected-error @below {{Given target is not a gpu.launch}}
-  %1 = transform.gpu.map_nested_forall_to_threads %funcop
+  %1 = transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1]
 }
 
 // -----
@@ -47,9 +47,9 @@ func.func @map_nested_forall_to_threads_excessive_threads(%x: memref<2 x 32 x f3
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{Trying to launch a GPU kernel with gridDim = (1, 1, 1) blockDim = (1200, 9, 1). It is larger than the limits.}}
-  // expected-note @below {{"blockDim" is too large}}
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [1200, 9, 1] }
+  // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (1, 1, 1) block_dims = (1200, 9, 1). It is larger than the limits.}}
+  // expected-note @below {{"block_dims" is too large}}
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1200, 9, 1]
 }
 
 // -----
@@ -90,7 +90,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   // expected-error @below {{Trying to map to fewer GPU threads than loop iterations but overprovisioning is not yet supported. Try additional tiling of the before mapping or map to more threads.}}
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
 }
 
 // -----
@@ -116,7 +116,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   // expected-error @below {{unsupported dynamic sizes}}
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
 }
 
 // -----
@@ -138,7 +138,7 @@ transform.sequence failures(propagate) {
   %forall, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   // expected-error @below {{only bufferized scf.forall can be mapped}}
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [128, 4, 1] }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [128, 4, 1]
 }
 
 // -----
@@ -243,8 +243,8 @@ func.func @map_forall_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<
 transform.sequence failures(propagate) {
 ^bb0(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{Trying to launch a GPU kernel with gridDim = (65535, 65535, 1) blockDim = (1, 1, 1). It is larger than the limits.}}
-  %1 = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch }
+  // expected-error @below {{Trying to launch a GPU kernel with grid_dims = (65535, 65535, 1) block_dims = (1, 1, 1). It is larger than the limits.}}
+  %1 = transform.gpu.map_forall_to_blocks %funcop generate_gpu_launch
 }
 
 // -----
@@ -271,7 +271,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
  // expected-error @below {{duplicated attribute, cannot map different loops to the same processor}}
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 32, 1]}
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1]
 }
 
 // -----

diff  --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 447ff1597657d..fcf56c8024bfa 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -33,7 +33,7 @@ func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_forall_to_blocks %funcop { gridDim = [12, 9, 1]}
+  transform.gpu.map_forall_to_blocks %funcop grid_dims = [12, 9, 1]
 }
 
 // -----
@@ -87,7 +87,7 @@ func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1]
 }
 
 // -----
@@ -127,7 +127,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   %gpuLaunch = transform.gpu.map_forall_to_blocks %funcop { generate_gpu_launch }
-  transform.gpu.map_nested_forall_to_threads %gpuLaunch { blockDim = [32, 4, 1] }
+  transform.gpu.map_nested_forall_to_threads %gpuLaunch block_dims = [32, 4, 1]
 }
 
 // -----
@@ -160,7 +160,7 @@ func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false
 }
 
 // -----
@@ -192,7 +192,7 @@ func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token)
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [32, 1, 1]}
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 1, 1]
 }
 
 // -----
@@ -228,7 +228,7 @@ func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %s
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
+  transform.gpu.map_nested_forall_to_threads %funcop block_dims = [12, 9, 1] sync_after_distribute = false
 }
 
 // -----
@@ -236,29 +236,64 @@ transform.sequence failures(propagate) {
 !type = memref<2 x 32 x f32>
 !type1d = memref<32 x f32>
 
+// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) floordiv 4)>
+// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 4) floordiv 2)>
+
+// CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)>
+// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) floordiv 20)>
+// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>
+
 // CHECK-LABEL: func.func @map_multi_level(
 func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
   %one = arith.constant 1 : index
-  %c12 = arith.constant 12 : index
+  %c10 = arith.constant 10 : index
   %c9 = arith.constant 9 : index
   %c7 = arith.constant 7 : index
-// check that the thread level got distributed but not the warp level.
-//  CHECK-NOT:  {mapping = #gpu.thread
-//      CHECK:  {mapping = [#gpu.warp<x>]}
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+
+  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG: %[[C11:.*]] = arith.constant 11 : index
+  // CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK-DAG: %[[C20:.*]] = arith.constant 20 : index
+
+  // check that both the thread level and the warp level got distributed.
+  //  CHECK-NOT: #gpu.thread
+  //  CHECK-NOT: #gpu.warp
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
+    // CHECK-DAG: %[[TIDX:.*]] = gpu.thread_id  x
+    // CHECK-DAG: %[[TIDY:.*]] = gpu.thread_id  y
     scf.forall (%i, %j) in (%c7, %c9) {
-        %4 = memref.load %x[%i, %j] : !type
-        %5 = memref.load %y[%i, %j] : !type
-        %6 = math.fma %alpha, %4, %5 : f32
-        memref.store %6, %y[%i, %j] : !type
-     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-     scf.forall (%i) in (%c12) {
+      %4 = memref.load %x[%i, %j] : !type
+      %5 = memref.load %y[%i, %j] : !type
+      %6 = math.fma %alpha, %4, %5 : f32
+      memref.store %6, %y[%i, %j] : !type
+    }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+
+    // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index
+    // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index
+    //     CHECK: %[[COND:.*]] = arith.andi %[[CMPY]], %[[CMPX]] : i1
+    //     CHECK: scf.if %[[COND]]
+    scf.forall (%i) in (%c1) {
         %7 = memref.load %t[%i] : !type1d
         %8 = arith.addf %alpha, %7 : f32
         memref.store %8, %t[%i] : !type1d
      }  {mapping = [#gpu.warp<x>] }
+
+    // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
+    // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
+    //     CHECK: scf.if %[[COND]]
+    scf.forall (%i, %j) in (%c10, %c2) {
+        %7 = memref.load %t[%i] : !type1d
+        %8 = arith.addf %alpha, %7 : f32
+        memref.store %8, %t[%j] : !type1d
+     }  {mapping = [#gpu.linear<x>, #gpu.linear<y>] }
     gpu.terminator
   }
   return %y : !type
@@ -267,5 +302,6 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.gpu.map_nested_forall_to_threads %funcop { blockDim = [12, 9, 1] }
+  transform.gpu.map_nested_forall_to_threads %funcop
+    block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
 }

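Putting the pieces together, here is a minimal sketch of the new surface syntax, condensed from the updated transform-gpu.mlir test above; it is illustrative only, and the constant/value names are ours rather than the patch's:

    // A bufferized scf.forall inside a gpu.launch, mapped onto linear ids.
    scf.forall (%i, %j) in (%c10, %c2) {
      // ... loads/stores on per-thread slices ...
    } { mapping = [#gpu.linear<x>, #gpu.linear<y>] }

    // One transform op now drives thread, warp and linear-id distribution.
    transform.sequence failures(propagate) {
    ^bb1(%arg0: !pdl.operation):
      %launch = transform.structured.match ops{["gpu.launch"]} in %arg0
        : (!pdl.operation) -> !pdl.operation
      transform.gpu.map_nested_forall_to_threads %launch
        block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
    }

As the CHECK lines above suggest, a forall mapped with #gpu.linear lowers to affine.apply expressions that re-decompose the flattened thread id in the loop's own basis, plus an scf.if guard that keeps only the first (here 10 * 2 = 20) linearized threads active.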