[Mlir-commits] [mlir] 5da2423 - [mlir][gpu] Only transform mapped parallel loops to GPU.

Fri Nov 13 00:22:34 PST 2020

Author: Stephan Herhut
Date: 2020-11-13T09:15:17+01:00
New Revision: 5da2423bc02f83598405fdfc532de0faa3502ec7

URL: https://github.com/llvm/llvm-project/commit/5da2423bc02f83598405fdfc532de0faa3502ec7
DIFF: https://github.com/llvm/llvm-project/commit/5da2423bc02f83598405fdfc532de0faa3502ec7.diff

LOG: [mlir][gpu] Only transform mapped parallel loops to GPU.

This exposes a hook to configure legality of operations such that only
`scf.parallel` operations that have mapping attributes are marked as
illegal. Consequently, the transformation can now also be applied to
mixed forms.

Differential Revision: https://reviews.llvm.org/D91340

Added: 
    

Modified: 
    mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
    mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
    mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
    mlir/test/Conversion/SCFToGPU/parallel_loop.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
index 900bc6e3fe84..d6316f663aa8 100644

--- a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
+++ b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
@@ -12,9 +12,10 @@
 
 namespace mlir {
 class AffineForOp;
+class ConversionTarget;
+struct LogicalResult;
 class MLIRContext;
 class OwningRewritePatternList;
-struct LogicalResult;
 class Value;
 
 namespace scf {
@@ -44,6 +45,10 @@ LogicalResult convertAffineLoopNestToGPULaunch(AffineForOp forOp,
 void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                        MLIRContext *ctx);
 
+/// Configures the rewrite target such that only `scf.parallel` operations that
+/// are not rewritten by the provided patterns are legal.
+void configureParallelLoopToGPULegality(ConversionTarget &target);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_SCFTOGPU_SCFTOGPU_H_

diff  --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index d494d12d0e4f..b7b4e7aab859 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -458,9 +458,10 @@ static LogicalResult processParallelLoop(
           if (!boundIsPrecise) {
             upperBound = deriveStaticUpperBound(upperBound, rewriter);
             if (!upperBound) {
-              return parallelOp.emitOpError()
-                     << "cannot derive loop-invariant upper bound for number "
-                        "of iterations";
+              return rewriter.notifyMatchFailure(
+                  parallelOp,
+                  "cannot derive loop-invariant upper bound for number of"
+                  "iterations");
             }
           }
           // Compute the number of iterations needed. We compute this as an
@@ -481,9 +482,9 @@ static LogicalResult processParallelLoop(
           // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
           // when this condition is relaxed.
           if (bounds.find(processor) != bounds.end()) {
-            return parallelOp.emitOpError()
-                   << "cannot redefine the bound for processor "
-                   << static_cast<int64_t>(processor);
+            return rewriter.notifyMatchFailure(
+                parallelOp, "cannot redefine the bound for processor " +
+                                Twine(static_cast<int64_t>(processor)));
           }
           bounds[processor] = launchBound;
         }
@@ -565,6 +566,10 @@ static LogicalResult processParallelLoop(
 LogicalResult
 ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                              PatternRewriter &rewriter) const {
+  // We can only transform starting at the outer-most loop. Launches inside of
+  // parallel loops are not supported.
+  if (auto parentLoop = parallelOp.getParentOfType<ParallelOp>())
+    return failure();
   // Create a launch operation. We start with bound one for all grid/block
   // sizes. Those will be refined later as we discover them from mappings.
   Location loc = parallelOp.getLoc();
@@ -640,3 +645,9 @@ void mlir::populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                              MLIRContext *ctx) {
   patterns.insert<ParallelToGpuLaunchLowering>(ctx);
 }
+
+void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
+  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp parallelOp) {
+    return !parallelOp.getAttr(gpu::getMappingAttrName());
+  });
+}

diff  --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
index d04a773939b7..2941b400babe 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -53,7 +53,7 @@ struct ParallelLoopToGpuPass
     target.addLegalDialect<AffineDialect>();
     target.addLegalDialect<gpu::GPUDialect>();
     target.addLegalDialect<scf::SCFDialect>();
-    target.addIllegalOp<scf::ParallelOp>();
+    configureParallelLoopToGPULegality(target);
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
       signalPassFailure();

diff  --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 3af50da6165f..2454ced8ac35 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -317,15 +317,13 @@ func @parallel_loop_optional_attr() {
 
 // -----
 
-// Mapping to the same processor twice.
+// Mapping to the same processor twice. Cannot be mapped.
 
 func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
                           %arg3 : index,
                           %buf : memref<?x?xf32>,
                           %res : memref<?x?xf32>) {
   %four = constant 4 : index
-  // expected-error at +2 {{cannot redefine the bound for processor 1}}
-  // expected-error at +1 {{failed to legalize operation 'scf.parallel'}}
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
   } { mapping = [
@@ -335,9 +333,12 @@ func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
   return
 }
 
+// CHECK-LABEL: @parallel_double_map
+// CHECK: scf.parallel
+
 // -----
 
-// Loop with loop-variant upper bound.
+// Loop with loop-variant upper bound. Cannot be mapped.
 
 func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
                                        %arg3 : index,
@@ -346,10 +347,8 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
   %zero = constant 0 : index
   %one = constant 1 : index
   %four = constant 4 : index
-  // expected-error at +1 {{failed to legalize operation 'scf.parallel'}}
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
-    // expected-error at +1 {{cannot derive loop-invariant upper bound}}
     scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
                                             step (%one, %one)  {
       %idx0 = addi %i0, %si0 : index
@@ -366,3 +365,25 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind
     ] }
   return
 }
+
+// CHECK-LABEL: @parallel_loop_loop_variant_bound
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+// Loop without annotations. Cannot be mapped.
+
+func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
+                              %arg3 : index,
+                              %buf : memref<?x?xf32>,
+                              %res : memref<?x?xf32>) {
+  %four = constant 4 : index
+  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+  }
+  return
+}
+
+// CHECK-LABEL: @parallel_no_annotations
+// CHECK: scf.parallel