[Mlir-commits] [mlir] [MLIR] Improve KernelOutlining to avoid introducing an extra block (PR #90359)
Mehdi Amini
llvmlistbot at llvm.org
Sun Apr 28 06:32:18 PDT 2024
https://github.com/joker-eph updated https://github.com/llvm/llvm-project/pull/90359
>From 2d03b72cd38f499dca697e41735bdc9007508e4b Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph at gmail.com>
Date: Sat, 27 Apr 2024 14:16:11 -0700
Subject: [PATCH] [MLIR] Improve KernelOutlining to avoid introducing an extra
block
This fixes a TODO in the code.
Also adds a test with a CFG in the region.
---
.../GPU/Transforms/KernelOutlining.cpp | 34 +++++++++---------
mlir/test/Dialect/GPU/outlining.mlir | 35 ++++++++++++++++---
2 files changed, 49 insertions(+), 20 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 2436113dc4239c..f5e80553ae72aa 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -241,24 +241,26 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
map.map(operand.value(), entryBlock.getArgument(operand.index()));
// Clone the region of the gpu.launch operation into the gpu.func operation.
- // TODO: If cloneInto can be modified such that if a mapping for
- // a block exists, that block will be used to clone operations into (at the
- // end of the block), instead of creating a new block, this would be much
- // cleaner.
launchOpBody.cloneInto(&outlinedFuncBody, map);
- // Branch from entry of the gpu.func operation to the block that is cloned
- // from the entry block of the gpu.launch operation.
- Block &launchOpEntry = launchOpBody.front();
- Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
- builder.setInsertionPointToEnd(&entryBlock);
- builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
-
- outlinedFunc.walk([](gpu::TerminatorOp op) {
- OpBuilder replacer(op);
- replacer.create<gpu::ReturnOp>(op.getLoc());
- op.erase();
- });
+ // Replace the terminator op with returns.
+ for (Block &block : launchOpBody) {
+ Block *clonedBlock = map.lookup(&block);
+ auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
+ if (!terminator)
+ continue;
+ OpBuilder replacer(terminator);
+ replacer.create<gpu::ReturnOp>(terminator->getLoc());
+ terminator->erase();
+ }
+
+ // Splice now the entry block of the gpu.launch operation at the end of the
+ // gpu.func entry block and erase the redundant block.
+ Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
+ entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+ clonedLaunchOpEntry->getOperations());
+ clonedLaunchOpEntry->erase();
+
return outlinedFunc;
}
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 601add9a9f91c0..915099fb4aeaa1 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -54,12 +54,41 @@ func.func @launch() {
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
+// -----
+
+// Verify that we can outline a CFG
+// CHECK-LABEL: gpu.func @launchCFG_kernel()
+// CHECK: cf.br
+// CHECK: gpu.return
+func.func @launchCFG() {
+ %0 = "op"() : () -> (f32)
+ %1 = "op"() : () -> (memref<?xf32, 1>)
+ %gDimX = arith.constant 8 : index
+ %gDimY = arith.constant 12 : index
+ %gDimZ = arith.constant 16 : index
+ %bDimX = arith.constant 20 : index
+ %bDimY = arith.constant 24 : index
+ %bDimZ = arith.constant 28 : index
+
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+ %grid_z = %gDimZ)
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+ %block_z = %bDimZ) {
+ "use"(%0): (f32) -> ()
+ cf.br ^bb1
+ ^bb1:
+ "some_op"(%bx, %block_x) : (index, index) -> ()
+ %42 = memref.load %1[%tx] : memref<?xf32, 1>
+ gpu.terminator
+ }
+ return
+}
+
+
// -----
// This test checks gpu-out-lining can handle gpu.launch kernel from an llvm.func
@@ -475,8 +504,6 @@ func.func @launch_cluster() {
// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
// CHECK-NEXT: = gpu.cluster_dim y
// CHECK-NEXT: = gpu.cluster_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
More information about the Mlir-commits
mailing list