[Mlir-commits] [mlir] 84eed78 - [Updated commit] Fix Transpose Check in MMA.SYNC Path.
Manish Gupta
llvmlistbot at llvm.org
Mon Apr 10 17:41:13 PDT 2023
Author: Manish Gupta
Date: 2023-04-11T00:38:35Z
New Revision: 84eed7843e61e8c24a03deb3dd46eeccfc906373
URL: https://github.com/llvm/llvm-project/commit/84eed7843e61e8c24a03deb3dd46eeccfc906373
DIFF: https://github.com/llvm/llvm-project/commit/84eed7843e61e8c24a03deb3dd46eeccfc906373.diff
LOG: [Updated commit] Fix Transpose Check in MMA.SYNC Path.
My previous commit pushed a stale version of the change from this review.
This commit updates mainline with the latest version, including the
changes from review comments. Apologies for the redundant commit.
Differential Revision: https://reviews.llvm.org/D147749
Added:
Modified:
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 10a6ee43a8f98..7d643fff3cad1 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -654,27 +654,32 @@ convertConstantOpMmaSync(RewriterBase &rewriter, arith::ConstantOp op,
/// Transposed Map Example:
/// Example 1 : (..., d0, d1) -> (d1 * 1, d0 * 2)
/// Example 2 : (d0, d1, d2, d3) -> (d3, d2)
-///
/// The code below checks if the output 2D is transposed using a generalized
/// version : (d0, d1, dn, ..., dm, ...) -> (dm, dn)
/// Returns : true; if m > n, false o.w.
-
-static bool isTransposed(vector::TransferReadOp op) {
+static FailureOr<bool> isTransposed(vector::TransferReadOp op) {
mlir::AffineMap map = op.getPermutationMap();
+
if (map.getNumResults() != 2) {
- op->emitError("Expected 2D transfer read");
+ LLVM_DEBUG(DBGS() << "Failed because the permutation map of "
+ "`vector.transfer_read` does not produce a 2-D result\n");
+ return failure();
}
// Output 2D matrix dimensions in the order of d0, d1.
- auto dM = map.getResult(0);
- auto dN = map.getResult(1);
+ mlir::AffineExpr dM = map.getResult(0);
+ mlir::AffineExpr dN = map.getResult(1);
// Find the position of these expressions in the input.
auto exprM = dM.dyn_cast<AffineDimExpr>();
auto exprN = dN.dyn_cast<AffineDimExpr>();
+
if (!exprM || !exprN) {
- op->emitError("Expected to find AffineDimExpr in vector::TransferReadOp");
+ LLVM_DEBUG(DBGS() << "Failed because the map results are not affine dim "
+ "expressions, so the transpose cannot be determined\n");
+ return failure();
}
+
return exprM.getPosition() > exprN.getPosition();
}
@@ -699,9 +704,15 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op,
return rewriter.notifyMatchFailure(op, "not mma sync reg info");
}
+ FailureOr<bool> transpose = isTransposed(op);
+ if (failed(transpose)) {
+ LLVM_DEBUG(DBGS() << "failed to determine the transpose\n");
+ return rewriter.notifyMatchFailure(
+ op, "Op should likely not be converted to a nvgpu.ldmatrix call.");
+ }
+
FailureOr<nvgpu::LdMatrixParams> params =
- nvgpu::getLdMatrixParams(*warpMatrixInfo,
- /*transpose=*/isTransposed(op));
+ nvgpu::getLdMatrixParams(*warpMatrixInfo, *transpose);
if (failed(params)) {
LLVM_DEBUG(
@@ -727,9 +738,9 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op,
SmallVector<Value, 4> indices;
getXferIndices<vector::TransferReadOp>(rewriter, op, *offsets, {laneId},
indices);
+
nvgpu::LdMatrixOp newOp = rewriter.create<nvgpu::LdMatrixOp>(
- loc, vectorType, op.getSource(), indices,
- /*transpose=*/isTransposed(op), params->numTiles);
+ loc, vectorType, op.getSource(), indices, *transpose, params->numTiles);
valueMapping[op] = newOp->getResult(0);
return success();
}
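For context on the doc comment on isTransposed in the hunk above: the check only compares the positions of the two result dimensions within the input dim list, so it works for transfer reads of any rank. Below is a minimal standalone sketch (not part of this commit; isTransposedMap and the main() driver are illustrative names) that applies the same position comparison to the two permutation maps exercised by the new test:

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;

// Same logic as the patched isTransposed(), but on a bare AffineMap:
// (d0, d1, ..., dn, ..., dm, ...) -> (dm, dn) is transposed iff m > n.
static FailureOr<bool> isTransposedMap(AffineMap map) {
  if (map.getNumResults() != 2)
    return failure();
  auto exprM = map.getResult(0).dyn_cast<AffineDimExpr>();
  auto exprN = map.getResult(1).dyn_cast<AffineDimExpr>();
  if (!exprM || !exprN)
    return failure(); // e.g. a constant or compound result expression
  return exprM.getPosition() > exprN.getPosition();
}

int main() {
  MLIRContext ctx;
  // #map_a in the test below: (d0, d1, d2, d3) -> (d1, d3); m = 1 < n = 3.
  AffineMap mapA = AffineMap::get(
      /*dimCount=*/4, /*symbolCount=*/0,
      {getAffineDimExpr(1, &ctx), getAffineDimExpr(3, &ctx)}, &ctx);
  // #map_b in the test below: (d0, d1, d2, d3) -> (d3, d2); m = 3 > n = 2.
  AffineMap mapB = AffineMap::get(
      /*dimCount=*/4, /*symbolCount=*/0,
      {getAffineDimExpr(3, &ctx), getAffineDimExpr(2, &ctx)}, &ctx);
  llvm::outs() << (*isTransposedMap(mapA) ? "true" : "false") << "\n"; // false
  llvm::outs() << (*isTransposedMap(mapB) ? "true" : "false") << "\n"; // true
  return 0;
}

This matches the CHECK lines in the test below: the read through #map_a lowers to nvgpu.ldmatrix with transpose = false, and the read through #map_b with transpose = true.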
diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
index 331cd85faf963..4465819fc7fe4 100644
--- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
+++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops-mma-sync.mlir
@@ -213,7 +213,6 @@ func.func @m16n8k16_fp16_row_row_row(%arg0: memref<20x20xf16, #gpu.address_space
// CHECK-LABEL: func @m16n16k16_mmasync16816_fp16_f16_row_row_row
func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<32x64xf16, #gpu.address_space<workgroup>>, %arg2: memref<42x64xf16, #gpu.address_space<workgroup>>) {
- %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16>
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%cst = arith.constant 0.000000e+00 : f16
@@ -253,6 +252,55 @@ func.func @m16n16k16_mmasync16816_fp16_f16_row_row_row(%arg0: memref<42x32xf16,
}
// -----
+//#################################################################################################################
+// FP16 row-row-row (Determine the transpose for multi-dimensional vector.transfer_read in vector-to-gpu lowering)
+//#################################################################################################################
+
+// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
+// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
+
+#map0 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map_a = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
+#map_b = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
+
+// CHECK-LABEL: func @multi_dim_m16n8k16_fp16_row_row_row
+func.func @multi_dim_m16n8k16_fp16_row_row_row(%arg0: memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, %arg1: memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, %arg2: memref<1x32x40xf16, #gpu.address_space<workgroup>>) {
+
+ // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+
+ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
+ // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$contiguous_map]]
+ // CHECK: [[fragmentA:%.+]] = nvgpu.ldmatrix %arg0[[[c0]], [[m_coord]], [[c0]], [[k_coord]]] {numTiles = 4 : i32, transpose = false}
+ %A = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_a} : memref<4x32x1x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
+
+ // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
+ // CHECK-DAG: [[k_coord:%.+]] = affine.apply [[$strided_map]]
+ // CHECK-DAG: [[fragmentB:%.+]] = nvgpu.ldmatrix %arg1[[[c0]], [[c0]], [[k_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = true}
+ %B = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map_b} : memref<4x1x32x32xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
+
+ // CHECK-DAG: [[m_coord:%.+]] = affine.apply [[$strided_map]]
+ // CHECK-DAG: [[n_coord:%.+]] = affine.apply [[$contiguous_map]]
+ // CHECK-DAG: [[fragmentC:%.*]] = nvgpu.ldmatrix %arg2[[[c0]], [[m_coord]], [[n_coord]]] {numTiles = 4 : i32, transpose = false}
+ %C = vector.transfer_read %arg2[%c0, %c0, %c0], %cst {in_bounds = [true, true]} : memref<1x32x40xf16, #gpu.address_space<workgroup>>, vector<16x16xf16>
+
+ // CHECK-DAG: [[fragmentB0:%.+]] = vector.extract_strided_slice [[fragmentB]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
+ // CHECK-DAG: [[fragmentC0:%.+]] = vector.extract_strided_slice [[fragmentC]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xf16> to vector<2x2xf16>
+ // CHECK: nvgpu.mma.sync([[fragmentA]], [[fragmentB0]], [[fragmentC0]]) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
+ %B0 = vector.extract_strided_slice %B {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
+ %C0 = vector.extract_strided_slice %C {offsets = [0, 0], sizes = [16, 8], strides = [1, 1]} : vector<16x16xf16> to vector<16x8xf16>
+ %D0 = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B0, %C0 : vector<16x16xf16>, vector<8x16xf16> into vector<16x8xf16>
+ vector.transfer_write %D0, %arg2[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<16x8xf16>, memref<1x32x40xf16, #gpu.address_space<workgroup>>
+
+ return
+}
+
+// -----
+
// CHECK-DAG: [[$strided_map:#.+]] = affine_map<()[s0] -> (s0 mod 16)>
// CHECK-DAG: [[$contiguous_map:#.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 8)>
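For reference, the two maps captured by the CHECK-DAG lines in this test file, [[$strided_map]] = s0 mod 16 and [[$contiguous_map]] = (s0 floordiv 16) * 8, turn the lane id (the laneId value threaded through getXferIndices in the C++ diff above) into the starting (row, col) of the 8-element f16 group each lane addresses for ldmatrix. A tiny sketch, illustrative only and not part of the commit, that prints this mapping for a full warp:

#include <cstdio>

// Evaluate the two affine maps from the CHECK-DAG capture lines:
//   [[$strided_map]]    : s0 mod 16            (row within the 16x16 tile)
//   [[$contiguous_map]] : (s0 floordiv 16) * 8 (starting column; 8 f16 = 16 B)
// for every lane id s0 in a 32-thread warp.
int main() {
  for (int lane = 0; lane < 32; ++lane) {
    int row = lane % 16;        // [[$strided_map]]
    int col = (lane / 16) * 8;  // [[$contiguous_map]]
    std::printf("lane %2d -> (row %2d, col %d)\n", lane, row, col);
  }
  return 0;
}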