[Mlir-commits] [mlir] 12f55ca - [MLIR][GPU] Add canonicalizer for gpu.memcpy
Uday Bondhugula
llvmlistbot at llvm.org
Tue Apr 19 05:24:58 PDT 2022
Author: Arnab Dutta
Date: 2022-04-19T17:54:00+05:30
New Revision: 12f55cac69d8978d1c433756a8b2114bf9ed1e1b
URL: https://github.com/llvm/llvm-project/commit/12f55cac69d8978d1c433756a8b2114bf9ed1e1b
DIFF: https://github.com/llvm/llvm-project/commit/12f55cac69d8978d1c433756a8b2114bf9ed1e1b.diff
LOG: [MLIR][GPU] Add canonicalizer for gpu.memcpy
Fold away a gpu.memcpy op when the only uses of its destination are the
memcpy op itself and the destination's allocation and deallocation ops.
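
For illustration, a minimal sketch of the kind of IR the new pattern
erases (a hypothetical synchronous example; %dst, %src, and the memref
shape are placeholder names, not taken from the patch):

  %dst = memref.alloc() : memref<32xf32>
  gpu.memcpy %dst, %src : memref<32xf32>, memref<32xf32>
  memref.dealloc %dst : memref<32xf32>

Since %dst is written by the copy but never read, the gpu.memcpy is dead
and can be removed; the leftover alloc/dealloc pair can then be cleaned
up by other patterns.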
Reviewed By: bondhugula
Differential Revision: https://reviews.llvm.org/D121279
Added:
Modified:
mlir/include/mlir/Dialect/GPU/GPUOps.td
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
mlir/test/Dialect/GPU/canonicalize.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index f93a32384becc..b8069897ca142 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -999,6 +999,7 @@ def GPU_MemcpyOp : GPU_Op<"memcpy", [GPU_AsyncOpInterface]> {
}];
let hasFolder = 1;
let hasVerifier = 1;
+ let hasCanonicalizer = 1;
}
def GPU_MemsetOp : GPU_Op<"memset",
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 86bafff4a1b6a..95c8a352e834f 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1064,6 +1064,55 @@ static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
printer << "]";
}
+namespace {
+
+/// Erases a common case of copy ops where the destination value is used
+/// only by the copy op itself and by alloc and dealloc ops.
+struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> {
+  using OpRewritePattern<MemcpyOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(MemcpyOp op,
+                                PatternRewriter &rewriter) const override {
+    Value dest = op.dst();
+    // If `dest` is a block argument, we cannot remove `op`.
+    if (dest.isa<BlockArgument>())
+      return failure();
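+    // True iff the given op implements the memory effect interface and
+    // its only effects on `val` are frees, i.e. it at most deallocates
+    // `val`.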
+    auto isDeallocLikeOpActingOnVal = [](Operation *op, Value val) {
+      auto memOp = dyn_cast<MemoryEffectOpInterface>(op);
+      if (!memOp)
+        return false;
+      llvm::SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>, 4>
+          memOpEffects;
+      memOp.getEffects(memOpEffects);
+      return llvm::none_of(memOpEffects, [val](auto &effect) {
+        return effect.getValue() == val &&
+               !isa<MemoryEffects::Free>(effect.getEffect());
+      });
+    };
+    // We can erase `op` iff `dest` has no other use apart from its
+    // use by `op` and dealloc ops.
+    if (llvm::any_of(dest.getUsers(), [isDeallocLikeOpActingOnVal, op,
+                                       dest](Operation *user) {
+          return user != op && !isDeallocLikeOpActingOnVal(user, dest);
+        }))
+      return failure();
+
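+    // Bail out unless `op` is either fully synchronous (no async
+    // dependencies and no token) or has exactly one async dependency and
+    // produces a token; in the latter case the token is replaced by the
+    // dependency below.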
+    if (op.asyncDependencies().size() > 1 ||
+        ((op.asyncDependencies().empty() && op.asyncToken()) ||
+         (!op.asyncDependencies().empty() && !op.asyncToken())))
+      return failure();
+    rewriter.replaceOp(op, op.asyncDependencies());
+    return success();
+  }
+};
+
+} // end anonymous namespace
+
+void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<EraseTrivialCopyOp>(context);
+}
+
//===----------------------------------------------------------------------===//
// GPU_SubgroupMmaLoadMatrixOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/canonicalize.mlir b/mlir/test/Dialect/GPU/canonicalize.mlir
index 225246be49b80..979095b7f41e2 100644
--- a/mlir/test/Dialect/GPU/canonicalize.mlir
+++ b/mlir/test/Dialect/GPU/canonicalize.mlir
@@ -28,6 +28,60 @@ func @fold_wait_op_test2(%arg0: i1) -> (memref<5xf16>, memref<5xf16>) {
// CHECK-NEXT: gpu.alloc async [%[[TOKEN1]]] ()
// CHECK-NEXT: return
+// CHECK-LABEL: func @fold_memcpy_op
+func @fold_memcpy_op(%arg0: i1) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = scf.if %arg0 -> (i1) {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  } else {
+    memref.dealloc %1 : memref<2xf16>
+    scf.yield %arg0 : i1
+  }
+  return
+}
+// CHECK-NOT: gpu.memcpy
+
+// We cannot fold memcpy here as dest is a block argument.
+// CHECK-LABEL: func @do_not_fold_memcpy_op1
+func @do_not_fold_memcpy_op1(%arg0: i1, %arg1: memref<2xf16>) {
+  %cst = arith.constant 0.000000e+00 : f16
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %arg1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  return
+}
+// CHECK: gpu.memcpy
+
+// We cannot fold the gpu.memcpy since its dest is used by an op that has
+// a read effect on it.
+// CHECK-LABEL: func @do_not_fold_memcpy_op2
+func @do_not_fold_memcpy_op2(%arg0: i1, %arg1: index) -> f16 {
+  %cst = arith.constant 0.000000e+00 : f16
+  %1 = memref.alloc() : memref<2xf16>
+  %2 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%2] () : memref<2xf16>
+  gpu.wait [%2]
+  affine.store %cst, %memref[0] : memref<2xf16>
+  %3 = gpu.wait async
+  %4 = gpu.memcpy async [%3] %1, %memref : memref<2xf16>, memref<2xf16>
+  gpu.wait [%3]
+  %5 = memref.load %1[%arg1] : memref<2xf16>
+  return %5 : f16
+}
+// CHECK: gpu.memcpy
+
// CHECK-LABEL: @memcpy_after_cast
func @memcpy_after_cast(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
// CHECK-NOT: memref.cast
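
These canonicalization tests are exercised through mlir-opt; a typical
invocation (input file name hypothetical) is:

  mlir-opt -canonicalize input.mlir

which repeatedly applies the registered patterns, including the new
EraseTrivialCopyOp, until a fixed point is reached.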