[Mlir-commits] [mlir] edeff6e - [mlir][GPU] Improve constant sinking in kernel outlining
Stephan Herhut
llvmlistbot at llvm.org
Tue Sep 29 05:46:26 PDT 2020
Author: Stephan Herhut
Date: 2020-09-29T14:46:15+02:00
New Revision: edeff6e642e66a5be05c11cb8b9b36b3383078ae
URL: https://github.com/llvm/llvm-project/commit/edeff6e642e66a5be05c11cb8b9b36b3383078ae
DIFF: https://github.com/llvm/llvm-project/commit/edeff6e642e66a5be05c11cb8b9b36b3383078ae.diff
LOG: [mlir][GPU] Improve constant sinking in kernel outlining
The previous implementation did not support sinking simple expressions. In particular,
it is often beneficial to sink dim operations.
Differential Revision: https://reviews.llvm.org/D88439
Added:
Modified:
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
mlir/test/Dialect/GPU/outlining.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index fcae3114188a..689161ed1fa2 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -18,6 +18,7 @@
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
using namespace mlir;
@@ -32,10 +33,10 @@ static void createForAllDimensions(OpBuilder &builder, Location loc,
}
}
-// Add operations generating block/thread ids and grid/block dimensions at the
-// beginning of the `launchFuncOpBody` region. Add mapping from argument in
-// entry block of `launchOpBody`, to the corresponding result value of the added
-// operations.
+/// Adds operations generating block/thread ids and grid/block dimensions at the
+/// beginning of the `launchFuncOpBody` region. Add mapping from argument in
+/// entry block of `launchOpBody`, to the corresponding result value of the
+/// added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
Region &launchOpBody,
BlockAndValueMapping &map) {
@@ -53,8 +54,48 @@ static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}
+/// Identifies operations that are beneficial to sink into kernels. These
+/// operations may not have side-effects, as otherwise sinking (and hence
+/// duplicating them) is not legal.
static bool isSinkingBeneficiary(Operation *op) {
- return isa<ConstantOp, DimOp>(op);
+ return isa<ConstantOp, DimOp, SelectOp, CmpIOp>(op);
+}
+
+/// For a given operation `op`, computes whether it is beneficial to sink the
+/// operation into the kernel. An operation can be sunk if doing so does not
+/// introduce new kernel arguments. Whether a value is already available in the
+/// kernel (and hence does not introduce new arguments) is checked by
+/// querying `availableValues`.
+/// If an operand is not yet available, we recursively check whether it can be
+/// made available by sinking its defining op.
+/// Operations that are identified for sinking are added to `beneficiaryOps` in
+/// the order they should appear in the kernel. Furthermore, `availableValues` is
+/// updated with results that will be available after sinking the identified
+/// ops.
+static bool extractBeneficiaryOps(Operation *op,
+ llvm::SetVector<Operation *> &beneficiaryOps,
+ llvm::SetVector<Value> &availableValues) {
+ if (beneficiaryOps.count(op))
+ return true;
+
+ if (!isSinkingBeneficiary(op))
+ return false;
+
+ for (Value operand : op->getOperands()) {
+ // It is already visible in the kernel, keep going.
+ if (availableValues.count(operand))
+ continue;
+ // Else check whether it can be made available via sinking.
+ Operation *definingOp = operand.getDefiningOp();
+ if (!definingOp ||
+ !extractBeneficiaryOps(definingOp, beneficiaryOps, availableValues))
+ return false;
+ }
+ // We will sink the operation, mark its results as now available.
+ beneficiaryOps.insert(op);
+ for (Value result : op->getResults())
+ availableValues.insert(result);
+ return true;
}
LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
@@ -65,59 +106,30 @@ LogicalResult mlir::sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp) {
llvm::SetVector<Value> sinkCandidates;
getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);
- llvm::SetVector<Value> sunkValues;
- llvm::SetVector<Operation *> sunkOperations;
- for (Value operand : sinkCandidates) {
+ SmallVector<Value, 4> worklist(sinkCandidates.begin(), sinkCandidates.end());
+ llvm::SetVector<Operation *> toBeSunk;
+ for (Value operand : worklist) {
Operation *operandOp = operand.getDefiningOp();
- if (!operandOp || !isSinkingBeneficiary(operandOp))
+ if (!operandOp)
continue;
- // Only sink operations that do not create new sinkCandidates.
- if (!llvm::all_of(operandOp->getOperands(), [&sinkCandidates](Value value) {
- return sinkCandidates.count(value);
- }))
- continue;
- sunkValues.insert(operand);
- sunkOperations.insert(operandOp);
+ extractBeneficiaryOps(operandOp, toBeSunk, sinkCandidates);
}
// Insert operations so that the defs get cloned before uses.
BlockAndValueMapping map;
OpBuilder builder(launchOpBody);
- DenseSet<Operation *> processed;
- SmallVector<Operation *, 2> clonedOps;
- while (processed.size() != sunkOperations.size()) {
- auto startSize = processed.size();
- for (Operation *sunkOperation : sunkOperations) {
- if (processed.count(sunkOperation))
- continue;
-
- // Operation cant be cloned yet if any of its operands is also being sunk,
- // but isnt cloned yet.
- if (llvm::any_of(
- sunkOperation->getOperands(), [&sunkValues, &map](Value value) {
- return sunkValues.count(value) && !map.lookupOrNull(value);
- }))
- continue;
-
- Operation *clonedOp = builder.clone(*sunkOperation, map);
- // Only replace uses within the launch op.
- for (auto result : llvm::enumerate(sunkOperation->getResults())) {
- auto replacement = clonedOp->getResult(result.index());
- for (auto &use : llvm::make_early_inc_range(result.value().getUses()))
- if (use.getOwner()->getParentOfType<gpu::LaunchOp>() == launchOp)
- use.set(replacement);
- }
- processed.insert(sunkOperation);
- }
- if (startSize == processed.size())
- return launchOp.emitError(
- "found illegal cyclic dependency between operations while sinking");
+ for (Operation *op : toBeSunk) {
+ Operation *clonedOp = builder.clone(*op, map);
+ // Only replace uses within the launch op.
+ for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
+ replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
+ launchOp.body());
}
return success();
}
-// Outline the `gpu.launch` operation body into a kernel function. Replace
-// `gpu.terminator` operations by `gpu.return` in the generated function.
+/// Outline the `gpu.launch` operation body into a kernel function. Replace
+/// `gpu.terminator` operations by `gpu.return` in the generated function.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
StringRef kernelFnName,
llvm::SetVector<Value> &operands) {
@@ -191,9 +203,9 @@ gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
return funcOp;
}
-// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
-// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
-// constant region arguments inlined.
+/// Replace `gpu.launch` operations with an `gpu.launch_func` operation
+/// launching `kernelFunc`. The kernel func contains the body of the
+/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
gpu::GPUFuncOp kernelFunc,
ValueRange operands) {
@@ -257,7 +269,7 @@ class GpuKernelOutliningPass
}
private:
- // Returns a gpu.module containing kernelFunc and all callees (recursive).
+ /// Returns a gpu.module containing kernelFunc and all callees (recursive).
gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
const SymbolTable &parentSymbolTable) {
// TODO: This code cannot use an OpBuilder because it must be inserted into
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 23a8b9d98881..d43bbc2eb992 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -60,7 +60,7 @@ func @launch() {
// -----
// CHECK: module attributes {gpu.container_module}
-
+// CHECK-LABEL: @multiple_launches
func @multiple_launches() {
// CHECK: %[[CST:.*]] = constant 8 : index
%cst = constant 8 : index
@@ -88,13 +88,66 @@ func @multiple_launches() {
// -----
-func @extra_constants(%arg0 : memref<?xf32>) {
+// CHECK-LABEL: @extra_constants_not_inlined
+func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
+ // CHECK: %[[CST:.*]] = constant 8 : index
+ %cst = constant 8 : index
+ %cst2 = constant 2 : index
+ %c0 = constant 0 : index
+ %cst3 = "secret_constant"() : () -> index
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_not_inlined_kernel::@extra_constants_not_inlined_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+ %grid_z = %cst)
+ threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+ %block_z = %cst) {
+ "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
+// CHECK: constant 2
+
+// -----
+
+// CHECK-LABEL: @extra_constants
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>
+func @extra_constants(%arg0: memref<?xf32>) {
// CHECK: %[[CST:.*]] = constant 8 : index
%cst = constant 8 : index
%cst2 = constant 2 : index
%c0 = constant 0 : index
%cst3 = dim %arg0, %c0 : memref<?xf32>
- // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %{{.*}}, %{{.*}}) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]]) {kernel = @extra_constants_kernel::@extra_constants_kernel} : (index, index, index, index, index, index, memref<?xf32>) -> ()
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
+ %grid_z = %cst)
+ threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
+ %block_z = %cst) {
+ "use"(%cst2, %arg0, %cst3) : (index, memref<?xf32>, index) -> ()
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK-LABEL: func @extra_constants_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
+// CHECK: constant 2
+// CHECK: constant 0
+// CHECK: dim %[[KARG0]]
+
+// -----
+
+// CHECK-LABEL: @extra_constants_noarg
+// CHECK-SAME: %[[ARG0:.*]]: memref<?xf32>, %[[ARG1:.*]]: memref<?xf32>
+func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
+ // CHECK: %[[CST:.*]] = constant 8 : index
+ %cst = constant 8 : index
+ %cst2 = constant 2 : index
+ %c0 = constant 0 : index
+ // CHECK: dim %[[ARG1]]
+ %cst3 = dim %arg1, %c0 : memref<?xf32>
+ // CHECK: "gpu.launch_func"(%[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[CST]], %[[ARG0]], %{{.*}}) {kernel = @extra_constants_noarg_kernel::@extra_constants_noarg_kernel} : (index, index, index, index, index, index, memref<?xf32>, index) -> ()
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
%grid_z = %cst)
threads(%tx, %ty, %tz) in (%block_x = %cst, %block_y = %cst,
@@ -105,9 +158,10 @@ func @extra_constants(%arg0 : memref<?xf32>) {
return
}
-// CHECK-LABEL: func @extra_constants_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
-// CHECK: constant
-// CHECK: constant
+// CHECK-LABEL: func @extra_constants_noarg_kernel
+// CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
+// CHECK: %[[KCST:.*]] = constant 2
+// CHECK: "use"(%[[KCST]], %[[KARG0]], %[[KARG1]])
// -----
@@ -135,6 +189,7 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
llvm.mlir.global internal @global(42 : i64) : !llvm.i64
+//CHECK-LABEL: @function_call
func @function_call(%arg0 : memref<?xf32>) {
%cst = constant 8 : index
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst,
More information about the Mlir-commits
mailing list