[Mlir-commits] [mlir] 92f088d - [mlir][gpu][transform] Provide better error messages and avoid crashing in MapForallToBlocks.
Nicolas Vasilache
llvmlistbot at llvm.org
Mon Sep 4 07:11:47 PDT 2023
Author: Nicolas Vasilache
Date: 2023-09-04T14:11:38Z
New Revision: 92f088d335e4a55f10e74c541ac52ec2a4d4ceeb
URL: https://github.com/llvm/llvm-project/commit/92f088d335e4a55f10e74c541ac52ec2a4d4ceeb
DIFF: https://github.com/llvm/llvm-project/commit/92f088d335e4a55f10e74c541ac52ec2a4d4ceeb.diff
LOG: [mlir][gpu][transform] Provide better error messages and avoid crashing in MapForallToBlocks.
This revision addresses issues surfaced in https://reviews.llvm.org/D159093
Added:
Modified:
mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
mlir/test/Dialect/GPU/transform-gpu-failing.mlir
mlir/test/python/dialects/transform_gpu_ext.py
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
index 3a58c7ccd0997d..616a82b08a6148 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -298,6 +298,8 @@ def MapForallToBlocks :
attr-dict
`:` functional-type($target, $result)
}];
+ let hasVerifier = 1;
+
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 71db26999a4944..6434f99838f005 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -149,10 +149,10 @@ def BufferizeToAllocationOp : Op<Transform_Dialect,
DefaultValuedAttr<StrAttr, "\"memref.alloc\"">:
$alloc_op,
UnitAttr:$bufferize_destination_only);
- let hasVerifier = 1;
let results = (outs Transform_AnyValue:$allocated_buffer,
Transform_AnyOpType:$new_ops);
let assemblyFormat = "$target attr-dict `:` type($target)";
+ let hasVerifier = 1;
let builders = [
OpBuilder<(ins "Value":$target, "Attribute":$memorySpace)>,
@@ -1325,8 +1325,8 @@ def SplitOp : Op<Transform_Dialect, "structured.split",
I64Attr:$static_split_point);
let results = (outs TransformHandleTypeInterface:$first,
TransformHandleTypeInterface:$second);
- let hasVerifier = 1;
let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
}
//===----------------------------------------------------------------------===//
@@ -2131,7 +2131,6 @@ def MaskedVectorizeOp : Op<Transform_Dialect, "structured.masked_vectorize",
attr-dict
`:` type($target)
}];
-
let hasVerifier = 1;
let extraClassDeclaration = [{
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index aaf337f36a6081..835cd978cd3a4e 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -40,6 +40,7 @@
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include <type_traits>
using namespace mlir;
using namespace mlir::gpu;
@@ -835,6 +836,13 @@ void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
// Block and thread mapping utilities.
//===----------------------------------------------------------------------===//
+namespace {
+/// Local types used for mapping verification.
+struct MappingKind {};
+struct BlockMappingKind : MappingKind {};
+struct ThreadMappingKind : MappingKind {};
+} // namespace
+
static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
Operation *target, const Twine &message) {
@@ -844,12 +852,14 @@ definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
}
/// Check if given mapping attributes are one of the desired attributes
+template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {
- if (!forallOp.getMapping().has_value())
+ if (!forallOp.getMapping().has_value()) {
return definiteFailureHelper(transformOp, forallOp,
- "mapping must be present");
+ "scf.forall op requires a mapping attribute");
+ }
bool hasBlockMapping =
llvm::any_of(forallOp.getMapping().value(), [](Attribute attr) {
@@ -877,6 +887,18 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
transformOp, forallOp,
"cannot mix different mapping types, use nesting");
}
+ if (std::is_same<MappingKindType, BlockMappingKind>::value &&
+ !hasBlockMapping) {
+ return definiteFailureHelper(
+ transformOp, forallOp,
+ "scf.forall op requires a mapping attribute of kind 'block'");
+ }
+ if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
+ !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
+ return definiteFailureHelper(transformOp, forallOp,
+ "scf.forall op requires a mapping attribute "
+ "of kind 'thread' or 'warp'");
+ }
DenseSet<Attribute> seen;
for (Attribute map : forallOp.getMapping()->getValue()) {
@@ -902,12 +924,13 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
return DiagnosedSilenceableFailure::success();
}
+template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
scf::ForallOp forallOp) {
// Check the types of the mapping attributes match.
DiagnosedSilenceableFailure typeRes =
- checkMappingAttributeTypes(transformOp, forallOp);
+ checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
if (!typeRes.succeeded())
return typeRes;
@@ -967,13 +990,6 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
LDBG("--start rewriteOneForallCommonImpl");
- // Step 0. GPU-specific verifications. There is no better place to anchor
- // those right now: the ForallOp is target-independent and the transform
- // op does not apply to individual ForallOp.
- DiagnosedSilenceableFailure diag = verifyGpuMapping(transformOp, forallOp);
- if (!diag.succeeded())
- return diag;
-
// Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
auto numParallelIterations =
getConstantIntValues(forallOp.getMixedUpperBound());
@@ -1143,6 +1159,16 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
const GpuIdBuilder &gpuIdBuilder) {
LDBG("Start mapForallToBlocksImpl");
+ {
+ // GPU-specific verifications. There is no better place to anchor
+ // those right now: the ForallOp is target-independent and the transform
+ // op does not apply to individual ForallOp.
+ DiagnosedSilenceableFailure diag =
+ verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
+ if (!diag.succeeded())
+ return diag;
+ }
+
Location loc = forallOp.getLoc();
Block *parentBlock = forallOp->getBlock();
Value zero;
@@ -1194,7 +1220,7 @@ mlir::transform::gpu::findTopLevelForallOp(Operation *target,
return WalkResult::advance();
});
- if (walkResult.wasInterrupted())
+ if (walkResult.wasInterrupted() || !topLevelForallOp)
return transformOp.emitSilenceableError()
<< "could not find a unique topLevel scf.forall";
return DiagnosedSilenceableFailure::success();
@@ -1222,6 +1248,7 @@ DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
diag.attachNote(target->getLoc()) << "when applied to this payload op";
return diag;
}
+ assert(topLevelForallOp && "expect an scf.forall");
SmallVector<int64_t> gridDims{getGridDims()};
if (!getGenerateGpuLaunch() && gridDims.size() != 3)
@@ -1244,9 +1271,12 @@ DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
}
// The BlockIdBuilder adapts to whatever is thrown at it.
- auto mappingAttr = cast<DeviceMappingAttrInterface>(
- topLevelForallOp.getMapping()->getValue().front());
- bool useLinearMapping = mappingAttr.isLinearMapping();
+ bool useLinearMapping = false;
+ if (topLevelForallOp.getMapping()) {
+ auto mappingAttr = cast<DeviceMappingAttrInterface>(
+ topLevelForallOp.getMapping()->getValue().front());
+ useLinearMapping = mappingAttr.isLinearMapping();
+ }
GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping);
diag = mlir::transform::gpu::mapForallToBlocksImpl(
@@ -1264,6 +1294,13 @@ DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
return diag;
}
+LogicalResult transform::MapForallToBlocks::verify() {
+ if (!getGridDims().empty() && getGridDims().size() != 3) {
+ return emitOpError() << "transform requires empty or size-3 grid_dims";
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// MapNestedForallToThreads
//===----------------------------------------------------------------------===//
@@ -1283,8 +1320,8 @@ static DiagnosedSilenceableFailure checkMappingSpec(
computeProduct(blockOrGridSizes)) {
auto diag = definiteFailureHelper(
transformOp, forallOp,
- Twine(
- "the number of required parallel resources (blocks or threads) ") +
+ Twine("the number of required parallel resources (blocks or "
+ "threads) ") +
std::to_string(computeProduct(numParallelIterations) * factor) +
std::string(" overflows the number of available resources ") +
std::to_string(computeProduct(blockOrGridSizes)));
@@ -1345,6 +1382,16 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
bool syncAfterDistribute) {
+ {
+ // GPU-specific verifications. There is no better place to anchor
+ // those right now: the ForallOp is target-independent and the transform
+ // op does not apply to individual ForallOp.
+ DiagnosedSilenceableFailure diag =
+ verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
+ if (!diag.succeeded())
+ return diag;
+ }
+
GpuIdBuilder gpuIdBuilder;
{
// Try to construct the id builder, if it fails, return.
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
index 564fdd46e91b17..74b34a88fe73c3 100644
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -328,3 +328,92 @@ transform.sequence failures(propagate) {
// expected-error @below {{cannot mix linear and non-linear mapping modes}}
transform.gpu.map_nested_forall_to_threads %funcop block_dims = [32, 32, 1] : (!transform.any_op) -> !transform.any_op
}
+
+// -----
+
+// expected-note @below {{when applied to this payload op}}
+module {
+transform.sequence failures(propagate) {
+^bb1(%op: !transform.any_op):
+ // expected-error @below {{could not find a unique topLevel scf.forall}}
+ %gpu_launch = transform.gpu.map_forall_to_blocks %op generate_gpu_launch grid_dims = [1, 1, 1]
+ : (!transform.any_op) -> !transform.any_op
+}
+}
+
+// -----
+
+func.func public @improperly_sized_grid_dims(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
+ scf.forall (%arg3, %arg4) in (1, 1) {
+ linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
+ } {mapping = [#gpu.block<x>, #gpu.block<y>]}
+ return
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !transform.any_op):
+ %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{transform requires empty or size-3 grid_dims}}
+ %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16] : (!transform.any_op) -> !transform.any_op
+}
+
+// -----
+
+func.func public @missing_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
+ scf.forall (%arg3, %arg4) in (1, 1) {
+ linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
+ }
+ return
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !transform.any_op):
+ %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{scf.forall op requires a mapping attribute}}
+ %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
+}
+
+// -----
+
+func.func public @not_a_block_mapping_attribute(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>, %arg2: memref<32x32xf32>) {
+ scf.forall (%arg3, %arg4) in (1, 1) {
+ linalg.matmul ins(%arg0, %arg1 : memref<32x32xf32>, memref<32x32xf32>) outs(%arg2 : memref<32x32xf32>)
+ } {mapping = [#gpu.thread<x>, #gpu.thread<y>]}
+ return
+}
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !transform.any_op):
+ %arg0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{scf.forall op requires a mapping attribute of kind 'block'}}
+ %5 = transform.gpu.map_forall_to_blocks %arg1 generate_gpu_launch grid_dims = [50, 16, 1] : (!transform.any_op) -> !transform.any_op
+}
+
+// -----
+
+func.func @not_a_thread_or_warp_mapping_attribute(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
+ %one = arith.constant 1 : index
+ %c900 = arith.constant 900 : index
+ %c9 = arith.constant 9 : index
+ %c7 = arith.constant 7 : index
+ %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+ threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+ {
+ scf.forall (%i, %j) in (%c7, %c900) {
+ %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
+ %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
+ %6 = math.fma %alpha, %4, %5 : f32
+ memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
+ } { mapping = [#gpu.block<y>, #gpu.block<x>] }
+ gpu.terminator
+ }
+
+ return %y : memref<2 x 32 x f32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !transform.any_op):
+ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ // expected-error @below {{scf.forall op requires a mapping attribute of kind 'thread' or 'warp'}}
+ transform.gpu.map_nested_forall_to_threads %funcop block_dims = [1, 1, 1] : (!transform.any_op) -> !transform.any_op
+}
diff --git a/mlir/test/python/dialects/transform_gpu_ext.py b/mlir/test/python/dialects/transform_gpu_ext.py
index db2899592609c5..17f5a6d38f8c1a 100644
--- a/mlir/test/python/dialects/transform_gpu_ext.py
+++ b/mlir/test/python/dialects/transform_gpu_ext.py
@@ -42,10 +42,10 @@ def testMapForallToBlocksTyped(target):
@run
def testMapForallToBlocksGridDims(target):
- gpu.MapForallToBlocks(target, grid_dims=[4, 2])
+ gpu.MapForallToBlocks(target, grid_dims=[4, 2, 1])
# CHECK-LABEL: TEST: testMapForallToBlocksGridDims
# CHECK: = transform.gpu.map_forall_to_blocks
- # CHECK-SAME: grid_dims = [4, 2]
+ # CHECK-SAME: grid_dims = [4, 2, 1]
# CHECK-SAME: (!transform.any_op) -> !transform.any_op
More information about the Mlir-commits
mailing list