[Mlir-commits] [mlir] 3f44495 - [mlir][GPU] Expose the functionality to create a GPUFuncOp from a LaunchOp

llvmlistbot at llvm.org llvmlistbot at llvm.org
Thu Mar 5 11:04:39 PST 2020


Author: MaheshRavishankar
Date: 2020-03-05T11:03:51-08:00
New Revision: 3f44495dfd61deddb6573c9d506a7a843b355735

URL: https://github.com/llvm/llvm-project/commit/3f44495dfd61deddb6573c9d506a7a843b355735
DIFF: https://github.com/llvm/llvm-project/commit/3f44495dfd61deddb6573c9d506a7a843b355735.diff

LOG: [mlir][GPU] Expose the functionality to create a GPUFuncOp from a LaunchOp

The current setup of the GPU dialect is to model both the host and
device side codegen. For cases (like IREE) the host side modeling
might not directly fit its use case, but device-side codegen is still
valuable. First step in accessing just the device-side functionality
of the GPU dialect is to allow just creating a gpu.func operation from
a gpu.launch operation. In addition this change also "inlines"
operations into the gpu.func op at time of creation instead of this
being a later step.

Differential Revision: https://reviews.llvm.org/D75287

Added: 
    mlir/include/mlir/Dialect/GPU/Utils.h

Modified: 
    mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
    mlir/test/Dialect/GPU/outlining.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils.h
new file mode 100644
index 000000000000..921408323ab0
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils.h
@@ -0,0 +1,44 @@
+//===- Utils.h - Utilities for the GPU dialect ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines utility functions exposed by the GPU dialect
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_UTILS_H_
+#define MLIR_DIALECT_GPU_UTILS_H_
+
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+struct LogicalResult;
+class Value;
+
+namespace gpu {
+class GPUFuncOp;
+class LaunchOp;
+} // namespace gpu
+
+/// Get a gpu.func created from outlining the region of a gpu.launch op with the
+/// given `kernelFnName`. The region of the `launchOp` can use values from
+/// above. These need to be captured and passed as arguments to the generated
+/// gpu.func. The generated function has arguments
+/// - corresponding to the values passed in as `operands`, in that order.
+/// - any additional values that might be used within the region of the
+///   `launchOp` and defined above it. These captured values are appended to the
+///   `operands` list.
+gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName,
+                                 SmallVectorImpl<Value> &operands);
+
+/// Sink operations into the `launchOp` to reduce the number of values that are
+/// used within the region of the operation, but defined outside of the
+/// region.
+LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp);
+
+} // namespace mlir
+#endif // MLIR_DIALECT_GPU_UTILS_H_

diff  --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index e301aca7769b..99029b010ba3 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/Dialect/GPU/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
@@ -32,12 +33,15 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
 }
 
 // Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `body` region and replace uses of the respective function
-// arguments.
-static void injectGpuIndexOperations(Location loc, Region &body) {
+// beginning of the `launchFuncOpBody` region. Add mapping from argument in
+// entry block of `launchOpBody`, to the corresponding result value of the added
+// operations.
+static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
+                                     Region &launchOpBody,
+                                     BlockAndValueMapping &map) {
   OpBuilder builder(loc->getContext());
-  Block &firstBlock = body.front();
-  builder.setInsertionPointToStart(&firstBlock);
+  Block &firstBlock = launchOpBody.front();
+  builder.setInsertionPointToStart(&launchFuncOpBody.front());
   SmallVector<Value, 12> indexOps;
   createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
   createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
@@ -45,73 +49,89 @@ static void injectGpuIndexOperations(Location loc, Region &body) {
   createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
   // Replace the leading 12 function args with the respective thread/block index
   // operations. Iterate backwards since args are erased and indices change.
-  for (int i = 11; i >= 0; --i) {
-    firstBlock.getArgument(i).replaceAllUsesWith(indexOps[i]);
-    firstBlock.eraseArgument(i);
-  }
+  for (auto indexOp : enumerate(indexOps))
+    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
 }
 
-static bool isInliningBeneficiary(Operation *op) {
+static bool isSinkingBeneficiary(Operation *op) {
   return isa<ConstantOp>(op) || isa<DimOp>(op);
 }
 
-// Move arguments of the given kernel function into the function if this reduces
-// the number of kernel arguments.
-static gpu::LaunchFuncOp inlineBeneficiaryOps(gpu::GPUFuncOp kernelFunc,
-                                              gpu::LaunchFuncOp launch) {
-  OpBuilder kernelBuilder(kernelFunc.getBody());
-  auto &firstBlock = kernelFunc.getBody().front();
-  SmallVector<Value, 8> newLaunchArgs;
-  BlockAndValueMapping map;
-  for (int i = 0, e = launch.getNumKernelOperands(); i < e; ++i) {
-    map.map(launch.getKernelOperand(i), kernelFunc.getArgument(i));
-  }
-  for (int i = launch.getNumKernelOperands() - 1; i >= 0; --i) {
-    auto operandOp = launch.getKernelOperand(i).getDefiningOp();
-    if (!operandOp || !isInliningBeneficiary(operandOp)) {
-      newLaunchArgs.push_back(launch.getKernelOperand(i));
+LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
+  Region &launchOpBody = launchOp.body();
+
+  // Identify uses from values defined outside of the scope of the launch
+  // operation.
+  llvm::SetVector<Value> sinkCandidates;
+  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
+
+  llvm::SetVector<Value> sunkValues;
+  llvm::SetVector<Operation *> sunkOperations;
+  for (Value operand : sinkCandidates) {
+    Operation *operandOp = operand.getDefiningOp();
+    if (!operandOp || !isSinkingBeneficiary(operandOp))
       continue;
-    }
-    // Only inline operations that do not create new arguments.
-    if (!llvm::all_of(operandOp->getOperands(),
-                      [map](Value value) { return map.contains(value); })) {
+    // Only sink operations that do not create new sinkCandidates.
+    if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
+          return sinkCandidates.count(value);
+        }))
       continue;
-    }
-    auto clone = kernelBuilder.clone(*operandOp, map);
-    firstBlock.getArgument(i).replaceAllUsesWith(clone->getResult(0));
-    firstBlock.eraseArgument(i);
+    sunkValues.insert(operand);
+    sunkOperations.insert(operandOp);
   }
-  if (newLaunchArgs.size() == launch.getNumKernelOperands())
-    return launch;
-
-  std::reverse(newLaunchArgs.begin(), newLaunchArgs.end());
-  OpBuilder LaunchBuilder(launch);
-  SmallVector<Type, 8> newArgumentTypes;
-  newArgumentTypes.reserve(firstBlock.getNumArguments());
-  for (auto value : firstBlock.getArguments()) {
-    newArgumentTypes.push_back(value.getType());
+
+  // Insert operations so that the defs get cloned before uses.
+  BlockAndValueMapping map;
+  OpBuilder builder(launchOpBody);
+  DenseSet<Operation *> processed;
+  SmallVector<Operation *, 2> clonedOps;
+  while (processed.size() != sunkOperations.size()) {
+    auto startSize = processed.size();
+    for (Operation *sunkOperation : sunkOperations) {
+      if (processed.count(sunkOperation))
+        continue;
+
+      // Operation can't be cloned yet if any of its operands is also being
+      // sunk, but isn't cloned yet.
+      if (llvm::any_of(
+              sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
+                return sunkValues.count(value) && !map.lookupOrNull(value);
+              }))
+        continue;
+
+      Operation *clonedOp = builder.clone(*sunkOperation, map);
+      // Only replace uses within the launch op.
+      for (auto result : llvm::enumerate(sunkOperation->getResults())) {
+        auto replacement = clonedOp->getResult(result.index());
+        for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
+          if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
+            use.set(replacement);
+      }
+      processed.insert(sunkOperation);
+    }
+    if (startSize == processed.size())
+      return launchOp.emitError(
+          "found illegal cyclic dependency between operations while sinking");
   }
-  kernelFunc.setType(LaunchBuilder.getFunctionType(newArgumentTypes, {}));
-  auto newLaunch = LaunchBuilder.create<gpu::LaunchFuncOp>(
-      launch.getLoc(), kernelFunc, launch.getGridSizeOperandValues(),
-      launch.getBlockSizeOperandValues(), newLaunchArgs);
-  launch.erase();
-  return newLaunch;
+  return success();
 }
 
 // Outline the `gpu.launch` operation body into a kernel function. Replace
 // `gpu.terminator` operations by `gpu.return` in the generated function.
-static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
-                                        llvm::SetVector<Value> &operands) {
+static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
+                                            StringRef kernelFnName,
+                                            llvm::SetVector<Value> &operands) {
   Location loc = launchOp.getLoc();
   // Create a builder with no insertion point, insertion will happen separately
   // due to symbol table manipulation.
   OpBuilder builder(launchOp.getContext());
+  Region &launchOpBody = launchOp.body();
 
   // Identify uses from values defined outside of the scope of the launch
   // operation.
-  getUsedValuesDefinedAbove(launchOp.body(), operands);
+  getUsedValuesDefinedAbove(launchOpBody, operands);
 
+  // Create the gpu.func operation.
   SmallVector<Type, 4> kernelOperandTypes;
   kernelOperandTypes.reserve(operands.size());
   for (Value operand : operands) {
@@ -119,38 +139,68 @@ static gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp,
   }
   FunctionType type =
       FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
-  std::string kernelFuncName =
-      Twine(launchOp.getParentOfType<FuncOp>().getName(), "_kernel").str();
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFuncName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
   outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());
-  outlinedFunc.body().takeBody(launchOp.body());
-  injectGpuIndexOperations(loc, outlinedFunc.body());
-  Block &entryBlock = outlinedFunc.body().front();
-  for (Value operand : operands) {
-    BlockArgument newArg = entryBlock.addArgument(operand.getType());
-    replaceAllUsesInRegionWith(operand, newArg, outlinedFunc.body());
-  }
+  BlockAndValueMapping map;
+
+  // Map the arguments corresponding to the launch parameters like blockIdx,
+  // threadIdx, etc.
+  Region &outlinedFuncBody = outlinedFunc.body();
+  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+
+  // Map arguments from gpu.launch region to the arguments of the gpu.func
+  // operation.
+  Block &entryBlock = outlinedFuncBody.front();
+  for (auto operand : enumerate(operands))
+    map.map(operand.value(), entryBlock.getArgument(operand.index()));
+
+  // Clone the region of the gpu.launch operation into the gpu.func operation.
+  // TODO(ravishankarm): If cloneInto can be modified such that if a mapping for
+  // a block exists, that block will be used to clone operations into (at the
+  // end of the block), instead of creating a new block, this would be much
+  // cleaner.
+  launchOpBody.cloneInto(&outlinedFuncBody, map);
+
+  // Branch from the entry of the gpu.func operation to the block that is
+  // the entry block of the gpu.launch operation.
+  Block &launchOpEntry = launchOpBody.front();
+  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
+  builder.setInsertionPointToEnd(&entryBlock);
+  builder.create<BranchOp>(loc, clonedLaunchOpEntry);
+
   outlinedFunc.walk([](gpu::TerminatorOp op) {
     OpBuilder replacer(op);
     replacer.create<gpu::ReturnOp>(op.getLoc());
     op.erase();
   });
-
   return outlinedFunc;
 }
 
+gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
+                                       StringRef kernelFnName,
+                                       llvm::SmallVectorImpl<Value> &operands) {
+  DenseSet<Value> inputOperandSet;
+  inputOperandSet.insert(operands.begin(), operands.end());
+  llvm::SetVector<Value> operandSet(operands.begin(), operands.end());
+  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
+  for (auto operand : operandSet) {
+    if (!inputOperandSet.count(operand))
+      operands.push_back(operand);
+  }
+  return funcOp;
+}
+
 // Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
 // `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
 // constant region arguments inlined.
-static void convertToLaunchFuncOp(gpu::LaunchOp &launchOp,
+static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                   gpu::GPUFuncOp kernelFunc,
                                   ValueRange operands) {
   OpBuilder builder(launchOp);
-  auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
+  builder.create<gpu::LaunchFuncOp>(
       launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
       launchOp.getBlockSizeOperandValues(), operands);
-  inlineBeneficiaryOps(kernelFunc, launchFuncOp);
   launchOp.erase();
 }
 
@@ -173,9 +223,16 @@ class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
     for (auto func : getModule().getOps<FuncOp>()) {
       // Insert just after the function.
       Block::iterator insertPt(func.getOperation()->getNextNode());
-      func.walk([&](gpu::LaunchOp op) {
+      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
         llvm::SetVector<Value> operands;
-        gpu::GPUFuncOp outlinedFunc = outlineKernelFunc(op, operands);
+        std::string kernelFnName =
+            Twine(op.getParentOfType<FuncOp>().getName(), "_kernel").str();
+
+        // Pull in operations that can be sunk into the launch region.
+        if (failed(sinkOperationsIntoLaunchOp(op)))
+          return WalkResult::interrupt();
+        gpu::GPUFuncOp outlinedFunc =
+            outlineKernelFuncImpl(op, kernelFnName, operands);
 
         // Create nested module and insert outlinedFunc. The module will
         // originally get the same name as the function, but may be renamed on
@@ -186,7 +243,10 @@ class GpuKernelOutliningPass : public ModulePass<GpuKernelOutliningPass> {
         // Potentially changes signature, pulling in constants.
         convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
         modified = true;
+        return WalkResult::advance();
       });
+      if (funcWalkResult.wasInterrupted())
+        return signalPassFailure();
     }
 
     // If any new module was inserted in this module, annotate this module as

diff  --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index ece5b030b7ed..7d9a6011eb66 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -51,6 +51,8 @@ func @launch() {
 // CHECK-NEXT: %[[BDIM:.*]] = "gpu.block_dim"() {dimension = "x"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "y"} : () -> index
 // CHECK-NEXT: = "gpu.block_dim"() {dimension = "z"} : () -> index
+// CHECK-NEXT: br ^[[BLOCK:.*]]
+// CHECK-NEXT: ^[[BLOCK]]:
 // CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
 // CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
 // CHECK-NEXT: = load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
@@ -108,6 +110,28 @@ func @extra_constants(%arg0 : memref<?xf32>) {
 
 // -----
 
+func @multiple_uses(%arg0 : memref<?xf32>) {
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  // CHECK: gpu.func {{.*}} {
+  // CHECK: %[[C2:.*]] = constant 2 : index
+  // CHECK: "use1"(%[[C2]], %[[C2]])
+  // CHECK: "use2"(%[[C2]])
+  // CHECK: gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+	                                %block_z = %c1) {
+    "use1"(%c2, %c2) : (index, index) -> ()
+    "use2"(%c2) : (index) -> ()
+    gpu.terminator
+  }
+  return
+}
+
+// -----
+
 llvm.mlir.global internal @global(42 : i64) : !llvm.i64
 
 func @function_call(%arg0 : memref<?xf32>) {


        


More information about the Mlir-commits mailing list