[Mlir-commits] [mlir] [mlir][gpu] Introduce the `gpu.conditional_execution` op (PR #78013)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Jan 12 18:00:33 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-gpu
@llvm/pr-subscribers-mlir
Author: Fabian Mora (fabianmcg)
<details>
<summary>Changes</summary>
This pass add the `gpu.conditional_execution`. This operation allows selecting
host or device code depending in the execution context.
For example:
```
func.func @<!-- -->conditional_execution(%dev: index, %host: index) {
%0 = gpu.conditional_execution device {
gpu.yield %dev : index
} host {
gpu.yield %host : index
} -> index
return
}
// mlir-opt --gpu-resolve-conditional-execution
func.func @<!-- -->conditional_execution(%dev: index, %host: index) {
%0 = scf.execute_region -> index {
scf.yield %host : index
}
return
}
```
This is a helpful operation combined with `gpu.launch`, as the kernel outlining
pass copies full symbols when outlining. Before this patch, functions called
from inside a launch op couldn't easily contain GPU operations -if the function
contained GPU ops, it had to be removed from the host module.
Note: Ignore the YieldOp commit; it's under review in #<!-- -->78006. Once that one gets
through, I'll remove the base commit and this note.
---
Full diff: https://github.com/llvm/llvm-project/pull/78013.diff
10 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h (+1)
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+44-1)
- (modified) mlir/include/mlir/Dialect/GPU/Transforms/Passes.h (+4)
- (modified) mlir/include/mlir/Dialect/GPU/Transforms/Passes.td (+31)
- (modified) mlir/lib/Dialect/GPU/CMakeLists.txt (+2)
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+40)
- (added) mlir/lib/Dialect/GPU/Transforms/ResolveConditionalExecution.cpp (+95)
- (modified) mlir/test/Dialect/GPU/invalid.mlir (+21)
- (modified) mlir/test/Dialect/GPU/ops.mlir (+26)
- (added) mlir/test/Dialect/GPU/resolve-conditional-execution.mlir (+78)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 58c0719c6a410c..96e1935bd0a841 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -23,6 +23,7 @@
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/SymbolTable.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 8d4a110ee801f0..591ce25c9d8e8a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -22,6 +22,7 @@ include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td"
include "mlir/IR/CommonTypeConstraints.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/SymbolInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/Interfaces/DataLayoutInterfaces.td"
include "mlir/Interfaces/FunctionInterfaces.td"
include "mlir/Interfaces/InferIntRangeInterface.td"
@@ -961,7 +962,7 @@ def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">,
let assemblyFormat = "attr-dict";
}
-def GPU_YieldOp : GPU_Op<"yield", [Pure, Terminator]>,
+def GPU_YieldOp : GPU_Op<"yield", [Pure, ReturnLike, Terminator]>,
Arguments<(ins Variadic<AnyType>:$values)> {
let summary = "GPU yield operation";
let description = [{
@@ -974,6 +975,8 @@ def GPU_YieldOp : GPU_Op<"yield", [Pure, Terminator]>,
gpu.yield %f0, %f1 : f32, f32
```
}];
+
+ let assemblyFormat = "attr-dict ($values^ `:` type($values))?";
}
// These mirror the reduction combining kinds from the vector dialect.
@@ -2724,4 +2727,44 @@ def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_ConditionalExecutionOp : GPU_Op<"conditional_execution", [
+ DeclareOpInterfaceMethods<RegionBranchOpInterface>
+ ]> {
+ let summary = "Executes a region of code based on the surrounding context.";
+ let description = [{
+ The `conditional_execution` operation executes a region of host or device
+ code depending on the surrounding execution context of the operation. If
+ the operation is inside a GPU module or launch operation, it executes the
+ device region; otherwise, it runs the host region.
+
+ This operation can yield a variadic set of results. If the operation yields
+ results, then both regions have to be present. However, if there are no
+ results, then it's valid to implement only one of the regions.
+
+ Examples:
+ ```mlir
+ // Conditional exeution with results.
+ %res = gpu.conditional_execution device {
+ ...
+ gpu.yield %val : i32
+ } host {
+ ...
+ gpu.yield %val : i32
+ } -> i32
+ // Conditional exeution with no results and only the host region.
+ gpu.conditional_execution host {
+ ...
+ gpu.yield
+ }
+ ```
+ }];
+ let results = (outs Variadic<AnyType>:$results);
+ let regions = (region AnyRegion:$hostRegion, AnyRegion:$deviceRegion);
+ let assemblyFormat = [{
+ (`device` $deviceRegion^)? (`host` $hostRegion^)? attr-dict
+ (`->` type($results)^)?
+ }];
+ let hasVerifier = 1;
+}
+
#endif // GPU_OPS
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 5885facd07541e..62c06cc604aef3 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -63,6 +63,10 @@ void populateGpuShufflePatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
+/// Collect a set of patterns to rewrite conditional-execution ops within the
+/// GPU dialect.
+void populateGpuConditionalExecutionPatterns(RewritePatternSet &patterns);
+
/// Collect a set of patterns to break down subgroup_reduce ops into smaller
/// ones supported by the target of `size <= maxShuffleBitwidth`, where `size`
/// is the subgroup_reduce value bitwidth.
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 3e0f6a3022f935..c694af71296de6 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -250,4 +250,35 @@ def GpuSPIRVAttachTarget: Pass<"spirv-attach-target", ""> {
];
}
+def GpuResolveConditionalExecutionPass :
+ Pass<"gpu-resolve-conditional-execution", ""> {
+ let summary = "Resolve all conditional execution operations";
+ let description = [{
+ This pass searches for all `gpu.conditional_execution` operations and
+ inlines the appropriate region depending on the execution context. If the
+ operation is inside any of the [`gpu.module`, `gpu.func`, `gpu.launch`]
+ operations, then the pass inlines the device region; otherwise, it
+ inlines the host region.
+ Example:
+ ```
+ func.func @conditional_execution(%dev: index, %host: index) {
+ %0 = gpu.conditional_execution device {
+ gpu.yield %dev : index
+ } host {
+ gpu.yield %host : index
+ } -> index
+ return
+ }
+ // mlir-opt --gpu-resolve-conditional-execution
+ func.func @conditional_execution(%dev: index, %host: index) {
+ %0 = scf.execute_region -> index {
+ scf.yield %host : index
+ }
+ return
+ }
+ ```
+ }];
+ let dependentDialects = ["scf::SCFDialect"];
+}
+
#endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 8f289ce9452e80..9692bda34269db 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -37,6 +37,7 @@ add_mlir_dialect_library(MLIRGPUDialect
LINK_LIBS PUBLIC
MLIRArithDialect
MLIRDLTIDialect
+ MLIRControlFlowInterfaces
MLIRFunctionInterfaces
MLIRInferIntRangeInterface
MLIRIR
@@ -57,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/ModuleToBinary.cpp
Transforms/NVVMAttachTarget.cpp
Transforms/ParallelLoopMapper.cpp
+ Transforms/ResolveConditionalExecution.cpp
Transforms/ROCDLAttachTarget.cpp
Transforms/SerializeToBlob.cpp
Transforms/SerializeToCubin.cpp
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 020900934c9f72..ef8f3f80a2f553 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -2204,6 +2204,46 @@ LogicalResult gpu::DynamicSharedMemoryOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// ConditionalExecutionOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult ConditionalExecutionOp::verify() {
+ Region &devRegion = getDeviceRegion();
+ Region &hostRegion = getHostRegion();
+ if (devRegion.empty() && hostRegion.empty())
+ return emitError("both regions can't be empty");
+ if (getResults().size() > 0 && (devRegion.empty() || hostRegion.empty()))
+ return emitError(
+ "when there are results both regions have to be specified");
+ if ((!devRegion.empty() &&
+ !mlir::isa<YieldOp>(devRegion.back().getTerminator())) ||
+ (!hostRegion.empty() &&
+ !mlir::isa<YieldOp>(hostRegion.back().getTerminator()))) {
+ return emitError(
+ "conditional execution regions must terminate with gpu.yield");
+ }
+ return success();
+}
+
+void ConditionalExecutionOp::getSuccessorRegions(
+ RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> ®ions) {
+ // Both sub-regions always return to the parent.
+ if (!point.isParent()) {
+ regions.push_back(RegionSuccessor(getResults()));
+ return;
+ }
+
+ Region &devRegion = getDeviceRegion();
+ Region &hostRegion = getHostRegion();
+
+ // Don't consider the regions if they are empty.
+ regions.push_back(devRegion.empty() ? RegionSuccessor()
+ : RegionSuccessor(&devRegion));
+ regions.push_back(hostRegion.empty() ? RegionSuccessor()
+ : RegionSuccessor(&hostRegion));
+}
+
//===----------------------------------------------------------------------===//
// GPU target options
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/GPU/Transforms/ResolveConditionalExecution.cpp b/mlir/lib/Dialect/GPU/Transforms/ResolveConditionalExecution.cpp
new file mode 100644
index 00000000000000..6861a66435ba12
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/ResolveConditionalExecution.cpp
@@ -0,0 +1,95 @@
+//===- ResolveConditionalExecution.cpp - Resolve conditional exec ops ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the `gpu-resolve-conditional-execution` pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+namespace mlir {
+#define GEN_PASS_DEF_GPURESOLVECONDITIONALEXECUTIONPASS
+#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
+} // namespace mlir
+
+namespace {
+class GpuResolveConditionalExecutionPass
+ : public impl::GpuResolveConditionalExecutionPassBase<
+ GpuResolveConditionalExecutionPass> {
+public:
+ using Base::Base;
+ void runOnOperation() final;
+};
+} // namespace
+
+void GpuResolveConditionalExecutionPass::runOnOperation() {
+ RewritePatternSet patterns(&getContext());
+ mlir::populateGpuConditionalExecutionPatterns(patterns);
+ if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
+ return signalPassFailure();
+}
+
+namespace {
+struct GpuConditionalExecutionOpRewriter
+ : public OpRewritePattern<ConditionalExecutionOp> {
+ using OpRewritePattern<ConditionalExecutionOp>::OpRewritePattern;
+ // Check whether the operation is inside a device execution context.
+ bool isDevice(Operation *op) const {
+ while ((op = op->getParentOp()))
+ if (isa<GPUFuncOp, LaunchOp, GPUModuleOp>(op))
+ return true;
+ return false;
+ }
+ LogicalResult matchAndRewrite(ConditionalExecutionOp op,
+ PatternRewriter &rewriter) const override {
+ bool isDev = isDevice(op);
+ // Remove the op if the device region is empty and we are in a device
+ // context.
+ if (isDev && op.getDeviceRegion().empty()) {
+ rewriter.eraseOp(op);
+ return success();
+ }
+ // Remove the op if the host region is empty and we are in a host context.
+ if (!isDev && op.getHostRegion().empty()) {
+ rewriter.eraseOp(op);
+ return success();
+ }
+ // Replace `ConditionalExecutionOp` with a `scf::ExecuteRegionOp`.
+ auto execRegionOp = rewriter.create<scf::ExecuteRegionOp>(
+ op.getLoc(), op.getResults().getTypes());
+ if (isDev)
+ rewriter.inlineRegionBefore(op.getDeviceRegion(),
+ execRegionOp.getRegion(),
+ execRegionOp.getRegion().begin());
+ else
+ rewriter.inlineRegionBefore(op.getHostRegion(), execRegionOp.getRegion(),
+ execRegionOp.getRegion().begin());
+ rewriter.eraseOp(op);
+ // This is safe because `ConditionalExecutionOp` always terminates with
+ // `gpu::YieldOp`
+ auto yieldOp =
+ dyn_cast<YieldOp>(execRegionOp.getRegion().back().getTerminator());
+ rewriter.setInsertionPoint(yieldOp);
+ rewriter.replaceOpWithNewOp<scf::YieldOp>(yieldOp, yieldOp.getValues());
+ return success();
+ }
+};
+} // namespace
+
+void mlir::populateGpuConditionalExecutionPatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<GpuConditionalExecutionOpRewriter>(patterns.getContext());
+}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 4d3a898fdd1565..920cca98296eb7 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -818,3 +818,24 @@ func.func @main(%arg0 : index) {
return
}
+// -----
+
+func.func @conditional_execution(%sz : index) {
+ // @expected-error at +1 {{when there are results both regions have to be specified}}
+ %val = gpu.conditional_execution device {
+ gpu.yield %sz: index
+ } -> index
+ return
+}
+
+// -----
+
+func.func @conditional_execution(%sz : index) {
+ // @expected-error at +1 {{'gpu.conditional_execution' op region control flow edge from Region #0 to parent results: source has 0 operands, but target successor needs 1}}
+ %val = gpu.conditional_execution device {
+ gpu.yield %sz: index
+ } host {
+ gpu.yield
+ } -> index
+ return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 60512424383052..cccaa39c22834a 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -94,6 +94,17 @@ module attributes {gpu.container_module} {
// CHECK-NEXT: } : (f32) -> f32
%sum1 = gpu.all_reduce add %one uniform {} : (f32) -> f32
+ // CHECK: %{{.*}} = gpu.all_reduce %{{.*}} {
+ // CHECK-NEXT: ^{{.*}}(%{{.*}}: f32, %{{.*}}: f32):
+ // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
+ // CHECK-NEXT: gpu.yield %{{.*}} : f32
+ // CHECK-NEXT: } : (f32) -> f32
+ %sum2 = gpu.all_reduce %one {
+ ^bb(%lhs : f32, %rhs : f32):
+ %tmp = arith.addf %lhs, %rhs : f32
+ gpu.yield %tmp : f32
+ } : (f32) -> (f32)
+
// CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (f32) -> f32
%sum_subgroup = gpu.subgroup_reduce add %one : (f32) -> f32
@@ -412,3 +423,18 @@ gpu.module @module_with_two_target [#nvvm.target, #rocdl.target<chip = "gfx90a">
gpu.return
}
}
+
+func.func @conditional_execution(%sz : index) {
+ %val = gpu.conditional_execution device {
+ gpu.yield %sz: index
+ } host {
+ gpu.yield %sz: index
+ } -> index
+ gpu.conditional_execution device {
+ gpu.yield
+ }
+ gpu.conditional_execution host {
+ gpu.yield
+ }
+ return
+}
diff --git a/mlir/test/Dialect/GPU/resolve-conditional-execution.mlir b/mlir/test/Dialect/GPU/resolve-conditional-execution.mlir
new file mode 100644
index 00000000000000..5c7420db374a55
--- /dev/null
+++ b/mlir/test/Dialect/GPU/resolve-conditional-execution.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-opt %s --gpu-resolve-conditional-execution -split-input-file | FileCheck %s
+
+// CHECK-LABEL:func.func @conditional_execution_host
+// CHECK: (%[[DEV:.*]]: index, %[[HOST:.*]]: index)
+func.func @conditional_execution_host(%dev : index, %host : index) {
+ // CHECK: %{{.*}} = scf.execute_region -> index {
+ // CHECK-NEXT: scf.yield %[[HOST]] : index
+ // CHECK-NEXT: }
+ // CHECK-NEXT: return
+ // Test that it returns %host.
+ %v = gpu.conditional_execution device {
+ gpu.yield %dev: index
+ } host {
+ gpu.yield %host: index
+ } -> index
+ return
+}
+
+// -----
+
+// CHECK-LABEL:func.func @conditional_execution_host
+func.func @conditional_execution_host(%memref: memref<f32>) {
+ // CHECK-NEXT: return
+ // CHECK-NEXT: }
+ // Test that the operation gets erased.
+ gpu.conditional_execution device {
+ %c1 = arith.constant 1.0 : f32
+ memref.store %c1, %memref[] : memref<f32>
+ gpu.yield
+ }
+ return
+}
+
+// -----
+
+gpu.module @conditional_execution_dev {
+// CHECK-LABEL:gpu.func @kernel
+// CHECK: (%[[DEV:.*]]: index, %[[HOST:.*]]: index)
+ gpu.func @kernel(%dev : index, %host : index) kernel {
+ // CHECK: %{{.*}} = scf.execute_region -> index {
+ // CHECK-NEXT: scf.yield %[[DEV]] : index
+ // CHECK-NEXT: }
+ // CHECK-NEXT: return
+ // Test that it returns %dev.
+ %v = gpu.conditional_execution device {
+ gpu.yield %dev: index
+ } host {
+ gpu.yield %host: index
+ } -> index
+ gpu.return
+ }
+}
+
+// -----
+
+// CHECK-LABEL:func.func @conditional_execution_dev
+// CHECK: (%[[MEMREF:.*]]: memref<f32>, %[[DEV:.*]]: f32, %[[HOST:.*]]: f32)
+func.func @conditional_execution_dev(%memref: memref<f32>, %fdev: f32, %fhost: f32) {
+ %c1 = arith.constant 1 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1)
+ threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) {
+ // CHECK: scf.execute_region {
+ // CHECK-NEXT: memref.store %[[DEV]], %[[MEMREF]][] : memref<f32>
+ // CHECK-NEXT: scf.yield
+ // CHECK-NEXT: }
+ // CHECK-NEXT: gpu.terminator
+ // Test that it uses %fdev.
+ gpu.conditional_execution device {
+ memref.store %fdev, %memref[] : memref<f32>
+ gpu.yield
+ } host {
+ memref.store %fhost, %memref[] : memref<f32>
+ gpu.yield
+ }
+ gpu.terminator
+ }
+ return
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/78013
More information about the Mlir-commits
mailing list