[Mlir-commits] [mlir] 59e4fbf - [mlir][memref] Fold subview into GPU subgroup MMA load/store ops
Lei Zhang
llvmlistbot at llvm.org
Wed Mar 15 10:58:51 PDT 2023
Author: Lei Zhang
Date: 2023-03-15T17:49:32Z
New Revision: 59e4fbfcd064032db66594617279e5fe726fd60e
URL: https://github.com/llvm/llvm-project/commit/59e4fbfcd064032db66594617279e5fe726fd60e
DIFF: https://github.com/llvm/llvm-project/commit/59e4fbfcd064032db66594617279e5fe726fd60e.diff
LOG: [mlir][memref] Fold subview into GPU subgroup MMA load/store ops
This commit adds support for folding memref.subview ops into GPU subgroup
MMA load/store ops.
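Roughly speaking, the new patterns rewrite MMA loads/stores that address memory
through a memref.subview so that they index the original memref directly: the
indices are rebased with affine.apply and the lead dimension attribute is kept
as-is. A simplified sketch (shapes, names, and attribute values below are
illustrative, not taken from the added tests):

  %sv = memref.subview %src[%off0, %off1] [32, 32] [1, 1]
      : memref<128x128xf16> to memref<32x32xf16, strided<[128, 1], offset: ?>>
  %m = gpu.subgroup_mma_load_matrix %sv[%i, %j] {leadDimension = 128 : index}
      : memref<32x32xf16, strided<[128, 1], offset: ?>> -> !gpu.mma_matrix<16x16xf16, "AOp">

  // becomes, after the subview is folded away:

  %i2 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%i)[%off0]
  %j2 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%j)[%off1]
  %m = gpu.subgroup_mma_load_matrix %src[%i2, %j2] {leadDimension = 128 : index}
      : memref<128x128xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">

The patterns are exposed via memref::populateFoldMemRefAliasOpPatterns and are
exercised by the fold-memref-alias-ops test pass, which drives the tests below.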
Reviewed By: ThomasRaoux
Differential Revision: https://reviews.llvm.org/D146150
Added:
Modified:
mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt
mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt
index 3cffe05995f0c..744f5c647b228 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/MemRef/Transforms/CMakeLists.txt
@@ -23,6 +23,7 @@ add_mlir_dialect_library(MLIRMemRefTransforms
MLIRArithTransforms
MLIRBufferizationDialect
MLIRFuncDialect
+ MLIRGPUOps
MLIRInferTypeOpInterface
MLIRLoopLikeInterface
MLIRMemRefDialect
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 73264bc76933a..89b34fc14fe42 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -216,6 +217,14 @@ static Value getMemRefOperand(vector::TransferWriteOp op) {
return op.getSource();
}
+static Value getMemRefOperand(gpu::SubgroupMmaLoadMatrixOp op) {
+ return op.getSrcMemref();
+}
+
+static Value getMemRefOperand(gpu::SubgroupMmaStoreMatrixOp op) {
+ return op.getDstMemref();
+}
+
/// Given the permutation map of the original
/// `vector.transfer_read`/`vector.transfer_write` operations compute the
/// permutation map to use after the subview is folded with it.
@@ -407,6 +416,11 @@ LogicalResult LoadOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
transferReadOp.getPadding(),
/*mask=*/Value(), transferReadOp.getInBoundsAttr());
})
+ .Case([&](gpu::SubgroupMmaLoadMatrixOp op) {
+ rewriter.replaceOpWithNewOp<gpu::SubgroupMmaLoadMatrixOp>(
+ op, op.getType(), subViewOp.getSource(), sourceIndices,
+ op.getLeadDimension(), op.getTransposeAttr());
+ })
.Default([](Operation *) { llvm_unreachable("unexpected operation."); });
return success();
}
@@ -502,11 +516,11 @@ LogicalResult StoreOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
llvm::TypeSwitch<Operation *, void>(storeOp)
.Case([&](AffineStoreOp op) {
rewriter.replaceOpWithNewOp<AffineStoreOp>(
- storeOp, storeOp.getValue(), subViewOp.getSource(), sourceIndices);
+ op, op.getValue(), subViewOp.getSource(), sourceIndices);
})
.Case([&](memref::StoreOp op) {
rewriter.replaceOpWithNewOp<memref::StoreOp>(
- storeOp, storeOp.getValue(), subViewOp.getSource(), sourceIndices,
+ op, op.getValue(), subViewOp.getSource(), sourceIndices,
op.getNontemporal());
})
.Case([&](vector::TransferWriteOp op) {
@@ -516,6 +530,11 @@ LogicalResult StoreOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
op.getPermutationMap()),
op.getInBoundsAttr());
})
+ .Case([&](gpu::SubgroupMmaStoreMatrixOp op) {
+ rewriter.replaceOpWithNewOp<gpu::SubgroupMmaStoreMatrixOp>(
+ op, op.getSrc(), subViewOp.getSource(), sourceIndices,
+ op.getLeadDimension(), op.getTransposeAttr());
+ })
.Default([](Operation *) { llvm_unreachable("unexpected operation."); });
return success();
}
@@ -590,9 +609,11 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
patterns.add<LoadOpOfSubViewOpFolder<AffineLoadOp>,
LoadOpOfSubViewOpFolder<memref::LoadOp>,
LoadOpOfSubViewOpFolder<vector::TransferReadOp>,
+ LoadOpOfSubViewOpFolder<gpu::SubgroupMmaLoadMatrixOp>,
StoreOpOfSubViewOpFolder<AffineStoreOp>,
StoreOpOfSubViewOpFolder<memref::StoreOp>,
StoreOpOfSubViewOpFolder<vector::TransferWriteOp>,
+ StoreOpOfSubViewOpFolder<gpu::SubgroupMmaStoreMatrixOp>,
LoadOpOfExpandShapeOpFolder<AffineLoadOp>,
LoadOpOfExpandShapeOpFolder<memref::LoadOp>,
StoreOpOfExpandShapeOpFolder<AffineStoreOp>,
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index f0d5008f991aa..e0e8bd48d6f86 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -524,3 +524,54 @@ func.func @fold_store_keep_nontemporal(%arg0 : memref<12x32xf32>, %arg1 : index,
memref.store %arg5, %0[%arg3, %arg4] {nontemporal=true}: memref<4x4xf32, strided<[64, 3], offset: ?>>
return
}
+
+// -----
+
+func.func @fold_gpu_subgroup_mma_load_matrix_1d(%src: memref<?xvector<4xf32>>, %offset: index, %i: index) -> !gpu.mma_matrix<16x16xf16, "COp"> {
+ %subview = memref.subview %src[%offset] [81920] [1] : memref<?xvector<4xf32>> to memref<81920xvector<4xf32>, strided<[1], offset: ?>>
+ %matrix = gpu.subgroup_mma_load_matrix %subview[%i] {leadDimension = 160 : index} : memref<81920xvector<4xf32>, strided<[1], offset: ?>> -> !gpu.mma_matrix<16x16xf16, "COp">
+ return %matrix: !gpu.mma_matrix<16x16xf16, "COp">
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK: func.func @fold_gpu_subgroup_mma_load_matrix_1d
+// CHECK-SAME: (%[[SRC:.+]]: memref<?xvector<4xf32>>, %[[OFFSET:.+]]: index, %[[I:.+]]: index)
+// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP]](%[[I]])[%[[OFFSET]]]
+// CHECK: %[[LOAD:.+]] = gpu.subgroup_mma_load_matrix %[[SRC]][%[[APPLY]]] {leadDimension = 160 : index} : memref<?xvector<4xf32>> -> !gpu.mma_matrix<16x16xf16, "COp">
+// CHECK: return %[[LOAD]]
+
+// -----
+
+func.func @fold_gpu_subgroup_mma_store_matrix_1d(%dst: memref<?xvector<4xf32>>, %offset: index, %i: index, %matrix: !gpu.mma_matrix<16x16xf16, "COp">) {
+ %subview = memref.subview %dst[%offset] [81920] [1] : memref<?xvector<4xf32>> to memref<81920xvector<4xf32>, strided<[1], offset: ?>>
+ gpu.subgroup_mma_store_matrix %matrix, %subview[%i] {leadDimension = 160 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<81920xvector<4xf32>, strided<[1], offset: ?>>
+ return
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK: func.func @fold_gpu_subgroup_mma_store_matrix_1d
+// CHECK-SAME: (%[[DST:.+]]: memref<?xvector<4xf32>>, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[VAL:.+]]: !gpu.mma_matrix<16x16xf16, "COp">)
+// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP]](%[[I0]])[%[[OFFSET]]]
+// CHECK: gpu.subgroup_mma_store_matrix %[[VAL]], %[[DST]][%[[APPLY]]] {leadDimension = 160 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<?xvector<4xf32>>
+
+// -----
+
+// CHECK-LABEL: func.func @fold_gpu_subgroup_mma_load_matrix_2d
+// CHECK-SAME: %[[SRC:.+]]: memref<128x128xf32>
+func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> !gpu.mma_matrix<16x16xf16, "COp"> {
+ %subview = memref.subview %arg0[%arg1, %arg2][64, 32][2, 1] : memref<128x128xf32> to memref<64x32xf32, strided<[64, 1], offset: ?>>
+ // CHECK: gpu.subgroup_mma_load_matrix %[[SRC]][{{.+}}] {leadDimension = 32 : index} : memref<128x128xf32> -> !gpu.mma_matrix<16x16xf16, "COp">
+ %matrix = gpu.subgroup_mma_load_matrix %subview[%arg3, %arg4] {leadDimension = 32 : index} : memref<64x32xf32, strided<[64, 1], offset: ?>> -> !gpu.mma_matrix<16x16xf16, "COp">
+ return %matrix : !gpu.mma_matrix<16x16xf16, "COp">
+}
+
+// -----
+
+// CHECK-LABEL: func.func @fold_gpu_subgroup_mma_store_matrix_2d
+// CHECK-SAME: %[[DST:.+]]: memref<128x128xf32>
+func.func @fold_gpu_subgroup_mma_store_matrix_2d(%arg0 : memref<128x128xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %matrix: !gpu.mma_matrix<16x16xf16, "COp">) {
+ %subview = memref.subview %arg0[%arg1, %arg2][64, 32][2, 1] : memref<128x128xf32> to memref<64x32xf32, strided<[64, 1], offset: ?>>
+ // CHECK: gpu.subgroup_mma_store_matrix %{{.+}}, %[[DST]][{{.+}}] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<128x128xf32>
+ gpu.subgroup_mma_store_matrix %matrix, %subview[%arg3, %arg4] {leadDimension = 32 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<64x32xf32, strided<[64, 1], offset: ?>>
+ return
+}
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index b0ea6bc3e2358..45a75fe6b977e 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -9954,6 +9954,7 @@ cc_library(
":ControlFlowDialect",
":DialectUtils",
":FuncDialect",
+ ":GPUDialect",
":IR",
":InferTypeOpInterface",
":LoopLikeInterface",