[Mlir-commits] [mlir] edeff6e - [mlir][GPU] Improve constant sinking in kernel outlining
Stephan Herhut
llvmlistbot at llvm.org
Tue Sep 29 05:46:26 PDT 2020
Author: Stephan Herhut
Date: 2020-09-29T14:46:15+02:00
New Revision: edeff6e642e66a5be05c11cb8b9b36b3383078ae
URL: https://github.com/llvm/llvm-project/commit/edeff6e642e66a5be05c11cb8b9b36b3383078ae
DIFF: https://github.com/llvm/llvm-project/commit/edeff6e642e66a5be05c11cb8b9b36b3383078ae.diff
LOG: [mlir][GPU] Improve constant sinking in kernel outlining
The previous implementation did not support sinking simple expressions. In particular,
it is often beneficial to sink dim operations.
Differential Revision: https://reviews.llvm.org/D88439
Added:
Modified:
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
mlir/test/Dialect/GPU/outlining.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index fcae3114188a..689161ed1fa2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -18,6 +18,7 @@
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
using namespace mlir;
@@ -32,10 +33,10 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
}
}
-// Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `launchFuncOpBody` region. Add mapping from argument in
-// entry block of `launchOpBody`, to the corresponding result value of the added
-// operations.
+/// Adds operations generating block/thread ids and grid/block dimensions at the
+/// beginning of the `launchFuncOpBody` region. Add mapping from argument in
+/// entry block of `launchOpBody`, to the corresponding result value of the
+/// added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
Region &launchOpBody,
BlockAndValueMapping &map) {
@@ -53,8 +54,48 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}
+/// Identifies operations that are beneficial to sink into kernels. These
+/// operations may not have side-effects, as otherwise sinking (and hence
+/// duplicating them) is not legal.
static bool isSinkingBeneficiary(Operation *op) {
- return isa<ConstantOp, DimOp>(op);
+ return isa<ConstantOp, DimOp, SelectOp, CmpIOp>(op);
+}
+
+/// For a given operation `op`, computes whether it is beneficial to sink the
+/// operation into the kernel. An operation can be sunk if doing so does not
+/// introduce new kernel arguments. Whether a value is already available in the
+/// kernel (and hence does not introduce new arguments) is checked by
+/// querying `availableValues`.
+/// If an operand is not yet available, we recursively check whether it can be
+/// made available by sinking its defining op.
+/// Operations that are identified for sinking are added to `beneficiaryOps` in
+/// the order they should appear in the kernel. Furthermore, `availableValues` is
+/// updated with results that will be available after sinking the identified
+/// ops.
+static bool extractBeneficiaryOps(Operation *op,
+ llvm::SetVector<Operation *> &beneficiaryOps,
+ llvm::SetVector<Value> &availableValues) {
+ if (beneficiaryOps.count(op))
+ return true;
+
+ if (!isSinkingBeneficiary(op))
+ return false;
+
+ for (Value operand : op->getOperands()) {
+ // It is already visible in the kernel, keep going.
+ if (availableValues.count(operand))
+ continue;
+ // Else check whether it can be made available via sinking.
+ Operation *definingOp = operand.getDefiningOp();
+ if (!definingOp ||
+ !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues))
+ return false;
+ }
+ // We will sink the operation, mark its results as now available.
+ beneficiaryOps.insert(op);
+ for (Value result : op->getResults())
+ availableValues.insert(result);
+ return true;
}
LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
@@ -65,59 +106,30 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
llvm::SetVector<Value> sinkCandidates;
getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
- llvm::SetVector<Value> sunkValues;
- llvm::SetVector<Operation *> sunkOperations;
- for (Value operand : sinkCandidates) {
+ SmallVector<Value, 4> worklist(sinkCandidates.begin(), sinkCandidates.end());
+ llvm::SetVector<Operation *> toBeSunk;
+ for (Value operand : worklist) {
Operation *operandOp = operand.getDefiningOp();
- if (!operandOp || !isSinkingBeneficiary(operandOp))
+ if (!operandOp)
continue;
- // Only sink operations that do not create new sinkCandidates.
- if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
- return sinkCandidates.count(value);
- }))
- continue;
- sunkValues.insert(operand);
- sunkOperations.insert(operandOp);
+ extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates);
}
// Insert operations so that the defs get cloned before uses.
BlockAndValueMapping map;
OpBuilder builder(launchOpBody);
- DenseSet<Operation *> processed;
- SmallVector<Operation *, 2> clonedOps;
- while (processed.size() != sunkOperations.size()) {
- auto startSize = processed.size();
- for (Operation *sunkOperation : sunkOperations) {
- if (processed.count(sunkOperation))
- continue;
-
- // Operation cant be cloned yet if any of its operands is also being sunk,
- // but isnt cloned yet.
- if (llvm::any_of(
- sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
- return sunkValues.count(value) && !map.lookupOrNull(value);
- }))
- continue;
-
- Operation *clonedOp = builder.clone(*sunkOperation, map);
- // Only replace uses within the launch op.
- for (auto result : llvm::enumerate(sunkOperation->getResults())) {
- auto replacement = clonedOp->getResult(result.index());
- for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
- if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
- use.set(replacement);
- }
- processed.insert(sunkOperation);
- }
- if (startSize == processed.size())
- return launchOp.emitError(
- "found illegal cyclic dependency between operations while sinking");
+ for (Operation *op : toBeSunk) {
+ Operation *clonedOp = builder.clone(*op, map);
+ // Only replace uses within the launch op.
+ for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
+ replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
+ launchOp.body());
}
return success();
}
-// Outline the `gpu.launch` operation body into a kernel function. Replace
-// `gpu.terminator` operations by `gpu.return` in the generated function.
+/// Outline the `gpu.launch` operation body into a kernel function. Replace
+/// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
StringRef kernelFnName,
llvm::SetVector<Value> &operands) {
@@ -191,9 +203,9 @@ gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
return funcOp;
}
-// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
-// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
-// constant region arguments inlined.
+/// Replace `gpu.launch` operations with an `gpu.launch_func` operation
+/// launching `kernelFunc`. The kernel func contains the body of the
+/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
gpu::GPUFuncOp kernelFunc,
ValueRange operands) {
@@ -257,7 +269,7 @@ class GpuKernelOutliningPass
}
private:
- // Returns a gpu.module containing kernelFunc and all callees (recursive).
+ /// Returns a gpu.module containing kernelFunc and all callees (recursive).
gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
const SymbolTable &parentSymbolTable) {
// TODO: This code cannot use an OpBuilder because it must be inserted into
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 23a8b9d98881..d43bbc2eb992 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -60,7 +60,7 @@ func @launch() {
// -----
// CHECK: module attributes {gpu.container_module}
-
+// CHECK-LABEL: @multiple_launches
func @multiple_launches() {
// CHECK: %[[CST:.*]] = constant 8 : index
%cst = constant 8 : index
@@ -88,13 +88,66 @@ func @multiple_launches() {
// -----
-func @extra_constants(%arg0 : memref<?xf32>) {
+// CHECK-LABEL: @extra_constants_not_inlined
+func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
+ // CHECK: %[[CST:.*]] = constant 8 : index
+ %cst = constant 8 : index
+ %cst2 = constant 2 : index
+ %c0 = constant 0 : index
+ %cst3 = "secret_constant"() : () -> index
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+ %grid_z = %cst)
+ threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+ %block_z = %cst) {
+ "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
+// CHECK: constant 2
+
+// -----
+
+// CHECK-LABEL: @extra_constants
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
+func @extra_constants(%arg0: memref<?xf32>) {
// CHECK: %[[CST:.*]] = constant 8 : index
%cst = constant 8 : index
%cst2 = constant 2 : index
%c0 = constant 0 : index
%cst3 = dim %arg0, %c0 : memref<?xf32>
- // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+ %grid_z = %cst)
+ threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+ %block_z = %cst) {
+ "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func @extra_constants_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
+// CHECK: constant 2
+// CHECK: constant 0
+// CHECK: dim %[[KARG0]]
+
+// -----
+
+// CHECK-LABEL: @extra_constants_noarg
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
+func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
+ // CHECK: %[[CST:.*]] = constant 8 : index
+ %cst = constant 8 : index
+ %cst2 = constant 2 : index
+ %c0 = constant 0 : index
+ // CHECK: dim %[[ARG1]]
+ %cst3 = dim %arg1, %c0 : memref<?xf32>
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
%grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
@@ -105,9 +158,10 @@ func @extra_constants(%arg0 : memref<?xf32>) {
return
}
-// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
-// CHECK: constant
-// CHECK: constant
+// CHECK-LABEL: func @extra_constants_noarg_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
+// CHECK: %[[KCST:.*]] = constant 2
+// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
// -----
@@ -135,6 +189,7 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
llvm.mlir.global internal @global(42 : i64) : !llvm.i64
+//CHECK-LABEL: @function_call
func @function_call(%arg0 : memref<?xf32>) {
%cst = constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
More information about the Mlir-commits
mailing list