[Mlir-commits] [mlir] [mlir][linalg] Add e2e test for linalg.mmt4d + pack/unpack (PR #84964)

Andrzej WarzyƄski llvmlistbot at llvm.org
Thu Mar 28 02:26:16 PDT 2024


https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/84964

>From 38d9c39780f19fbd205ad53f248d15173041de88 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Mon, 11 Mar 2024 18:20:45 +0000
Subject: [PATCH 1/4] [mlir][linalg] Add e2e test for linalg.mmt4d

This is a follow-up to #81790. This patch extends:

  * test/Integration/Dialect/Linalg/CPU/mmt4d.mlir

with pack/unpack ops so that the overall computation is a matrix
multiplication (as opposed to linalg.mmt4d). For comparison (and to make
it easier to verify correctness), linalg.matmul is also included in the
test.
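
For reference, the pack ops tile (and pad) the operands into the 4D
layout that linalg.mmt4d expects. Taking the RHS as a worked example
(the op below is copied from the test; the arithmetic is just a summary
of tensor.pack's semantics):

  %B_pack = tensor.pack %B padding_value(%zero : i32)
      outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1]
      into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32>

Dim 1 (size 13) is tiled by 8, i.e. ceil(13/8) = 2 tiles with 3 padded
columns, and dim 0 (size 16) is tiled by 1, i.e. 16 tiles. Applying
outer_dims_perm = [1, 0] gives outer dims [2, 16] followed by the inner
tiles [8, 1], hence tensor<2x16x8x1xi32>.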
---
 .../Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir | 174 ++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
new file mode 100644
index 00000000000000..bf7a98fc07f086
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
@@ -0,0 +1,174 @@
+// DEFINE: %{compile} =  mlir-opt %s \
+// DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
+// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
+// DEFINE: %{entry_point} = main
+// DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
+// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
+
+// RUN: %{compile}
+
+// RUN: %{run} | FileCheck %s
+
+/// End-to-end test for computing matrix-multiplicatin using linalg.mmt4d. In
+/// particular, demonstrates how the following MLIR sequence (implemented in @mmt4d):
+///
+///   A_pack = tensor.pack A
+///   B_pack = tensor.pack B
+///   C_pack = tensor.pack C
+///   out_pack = linalg.mmt4d(A_pack, B_ pack, C_pack)
+///
+/// is equivalent to:
+///
+///  linalg.matmul(A, B, C)
+///
+/// (implemented in @matmul).
+
+func.func @main() {
+  // Allocate and initialise the inputs
+  %A_alloc = tensor.empty() : tensor<7x16xi32>
+  %B_alloc = tensor.empty() : tensor<16x13xi32>
+
+  %three = arith.constant 3 : i32
+  %four = arith.constant 4 : i32
+  %A = linalg.fill ins(%three : i32) outs(%A_alloc : tensor<7x16xi32>) -> tensor<7x16xi32>
+  %B = linalg.fill ins(%four : i32) outs(%B_alloc : tensor<16x13xi32>) -> tensor<16x13xi32>
+  %C = arith.constant dense<[
+    [ 1,  8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85],
+    [ 2,  9, 16, 23, 30, 37, 44, 51, 58, 65, 72, 79, 86],
+    [ 3, 10, 17, 24, 31, 38, 45, 52, 59, 66, 73, 80, 87],
+    [ 4, 11, 18, 25, 32, 39, 46, 53, 60, 67, 74, 81, 88],
+    [ 5, 12, 19, 26, 33, 40, 47, 54, 61, 68, 75, 82, 89],
+    [ 6, 13, 20, 27, 34, 41, 48, 55, 62, 69, 76, 83, 90],
+    [ 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91]
+  ]> : tensor<7x13xi32>
+  
+  // Matrix multiplication via linalg.mmt4d
+  // CHECK: Unranked Memref
+  // CHECK:  [193,   200,   207,   214,   221,   228,   235,   242,   249,   256,   263,   270,   277]
+  // CHECK:  [194,   201,   208,   215,   222,   229,   236,   243,   250,   257,   264,   271,   278]
+  // CHECK:  [195,   202,   209,   216,   223,   230,   237,   244,   251,   258,   265,   272,   279]
+  // CHECK:  [196,   203,   210,   217,   224,   231,   238,   245,   252,   259,   266,   273,   280]
+  // CHECK:  [197,   204,   211,   218,   225,   232,   239,   246,   253,   260,   267,   274,   281]
+  // CHECK:  [198,   205,   212,   219,   226,   233,   240,   247,   254,   261,   268,   275,   282]
+  // CHECK:  [199,   206,   213,   220,   227,   234,   241,   248,   255,   262,   269,   276,   283]
+  %C_mmt4d = func.call @mmt4d(%A, %B, %C) : (tensor<7x16xi32>, tensor<16x13xi32>, tensor<7x13xi32>) -> tensor<7x13xi32>
+  %xf = tensor.cast %C_mmt4d : tensor<7x13xi32> to tensor<*xi32>
+  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
+
+  // Matrix multiplicaiton with linalg.matmul
+  // CHECK: Unranked Memref
+  // CHECK:  [193,   200,   207,   214,   221,   228,   235,   242,   249,   256,   263,   270,   277]
+  // CHECK:  [194,   201,   208,   215,   222,   229,   236,   243,   250,   257,   264,   271,   278]
+  // CHECK:  [195,   202,   209,   216,   223,   230,   237,   244,   251,   258,   265,   272,   279]
+  // CHECK:  [196,   203,   210,   217,   224,   231,   238,   245,   252,   259,   266,   273,   280]
+  // CHECK:  [197,   204,   211,   218,   225,   232,   239,   246,   253,   260,   267,   274,   281]
+  // CHECK:  [198,   205,   212,   219,   226,   233,   240,   247,   254,   261,   268,   275,   282]
+  // CHECK:  [199,   206,   213,   220,   227,   234,   241,   248,   255,   262,   269,   276,   283]
+  %C_matmul = func.call @matmul(%A, %B, %C) : (tensor<7x16xi32>, tensor<16x13xi32>, tensor<7x13xi32>) -> tensor<7x13xi32>
+  %xf_2 = tensor.cast %C_matmul : tensor<7x13xi32> to tensor<*xi32>
+  call @printMemrefI32(%xf_2) : (tensor<*xi32>) -> ()
+
+  return
+}
+
+func.func @matmul(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
+  %C_matmul = linalg.matmul ins(%A, %B: tensor<7x16xi32>, tensor<16x13xi32>) 
+                            outs(%C: tensor<7x13xi32>) -> tensor<7x13xi32>
+
+  return %C_matmul : tensor<7x13xi32>
+}
+
+func.func @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
+  %zero = arith.constant 0 : i32
+
+  %cst = arith.constant 0 : i32
+  %A_pack_empty = tensor.empty() : tensor<2x16x8x1xi32>
+  %B_pack_empty = tensor.empty() : tensor<2x16x8x1xi32>
+  %C_pack_empty = tensor.empty() : tensor<2x2x8x8xi32>
+
+  // Pack matrices
+  %A_pack = tensor.pack %A padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32>
+  %B_pack = tensor.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32>
+  %C_pack = tensor.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32>
+
+  // MMT4D
+  %mmt4d = linalg.mmt4d ins(%A_pack, %B_pack : tensor<2x16x8x1xi32>, tensor<2x16x8x1xi32>) outs(%C_pack : tensor<2x2x8x8xi32>) -> tensor<2x2x8x8xi32>
+
+  // Unpack output
+  %C_out_empty = tensor.empty() : tensor<7x13xi32>
+  %C_out_unpack = tensor.unpack %mmt4d outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_out_empty : tensor<2x2x8x8xi32> -> tensor<7x13xi32>
+
+  return %C_out_unpack : tensor<7x13xi32>
+}
+
+module @transforms attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
+   %mmt4d = transform.collect_matching @match_mmt4d in %module : (!transform.any_op) -> (!transform.any_op)
+   %func = transform.get_parent_op %mmt4d {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func">
+
+   // Step 1: Tile
+   // Tile parallel dims
+   %tiled_linalg_op_p, %loops:4 = transform.structured.tile_using_for %mmt4d[1, 1, 0, 8, 8, 0]
+     : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+   // Tile reduction dims
+   %tiled_linalg_op_r, %loops2:2 = transform.structured.tile_using_for %tiled_linalg_op_p[0, 0, 1, 0, 0, 1]
+     : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+   // Step 2: Vectorize
+   transform.structured.vectorize %tiled_linalg_op_r : !transform.any_op
+
+   // Step 3: Simplify
+   // vector.multi_reduction --> vector.contract
+   // Generates a 6-dim vector.contract with the dim matching the original MMT4D Op
+   // and with the following split into parallel and reduction dims:
+   //    * parallel, parallel, reduction, parallel, parallel, reduction
+   transform.apply_patterns to %func {
+     transform.apply_patterns.vector.reduction_to_contract
+     // Reduce the rank of xfer ops. This transforms vector.contract to be
+     // more matmul-like and to enable the lowering to outer product Ops.
+     transform.apply_patterns.vector.transfer_permutation_patterns
+   } : !transform.op<"func.func">
+
+   // Hoisting and LICM - not strictly required
+   %func_h = transform.structured.hoist_redundant_vector_transfers %func
+     : (!transform.op<"func.func">) -> !transform.op<"func.func">
+   %all_loops = transform.structured.match interface{LoopLikeInterface} in %func_h
+     : (!transform.op<"func.func">) -> !transform.any_op
+   transform.apply_licm to %all_loops : !transform.any_op
+   transform.loop.hoist_loop_invariant_subsets %all_loops : !transform.any_op
+
+   // Simplify the 6-dim vector.contract into a 3-dim matmul-like
+   // vector.contract with the following split into parallel and reduction
+   // dims:
+   //    * parallel, parallel, reduction
+   transform.apply_patterns to %func_h {
+     transform.apply_patterns.vector.reduction_to_contract
+     transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+     transform.apply_patterns.canonicalization
+   } : !transform.op<"func.func">
+
+   // Step 4. Lower tensor.pack
+   %pack = transform.structured.match ops{["tensor.pack"]} in %func_h
+     : (!transform.op<"func.func">) -> !transform.op<"tensor.pack">
+   transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">)
+     -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">)
+
+   // Step 5. Lower tensor.unpack
+   %unpack = transform.structured.match ops{["tensor.unpack"]} in %func_h
+      : (!transform.op<"func.func">) -> !transform.op<"tensor.unpack">
+    transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">)
+      -> (!transform.op<"tensor.empty">,
+          !transform.op<"linalg.transpose">,
+          !transform.op<"tensor.collapse_shape">,
+          !transform.op<"tensor.extract_slice">)
+    transform.yield
+  }
+
+  transform.named_sequence @match_mmt4d(
+      %entry: !transform.any_op {transform.readonly}) -> !transform.any_op {
+    transform.match.operation_name %entry ["linalg.mmt4d"] : !transform.any_op
+    transform.yield %entry : !transform.any_op
+  }
+}
+
+func.func private @printMemrefI32(%ptr : tensor<*xi32>)

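A note on Steps 1-3 of the schedule above, since the tile-size vectors
are easy to misread: linalg.mmt4d has six loops, (M, N, K, m, n, k),
split as (parallel, parallel, reduction, parallel, parallel, reduction).
The first tile_using_for tiles the parallel dims with [1, 1, 0, 8, 8, 0]
(zeros mean "don't tile") and the second tiles the reduction dims with
[0, 0, 1, 0, 0, 1], so the vectorizer sees a static (1, 1, 1, 8, 8, 1)
iteration space. After the rank-reducing patterns in Step 3 the kernel
boils down to a matmul-like 3-D contraction, roughly (a sketch of the
expected shapes, not IR dumped from the pipeline):

  %res = vector.contract {...} %lhs, %rhs, %acc
    : vector<8x1xi32>, vector<8x1xi32> into vector<8x8xi32>

i.e. an 8x8 accumulator updated from an 8-element LHS tile and an
8-element RHS tile, which is the form the outer-product lowering wants.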
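Similarly for Steps 4-5: lower_pack decomposes each tensor.pack into
pad + expand_shape + transpose (the three handles it returns), and
lower_unpack mirrors that with empty + transpose + collapse_shape +
extract_slice. For the RHS pack the decomposition looks roughly like
this (illustrative, hand-derived from the op's semantics rather than
copied from the pass output; %init stands for the packed destination):

  %padded = tensor.pad %B low[0, 0] high[0, 3] {
  ^bb0(%i: index, %j: index):
    tensor.yield %zero : i32
  } : tensor<16x13xi32> to tensor<16x16xi32>
  %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]]
    : tensor<16x16xi32> into tensor<16x1x2x8xi32>
  %packed = linalg.transpose ins(%expanded : tensor<16x1x2x8xi32>)
    outs(%init : tensor<2x16x8x1xi32>) permutation = [2, 0, 3, 1]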
>From 41d8338c8ca3a292aef3b6af642b2d6374ed1ae6 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 13 Mar 2024 16:09:19 +0000
Subject: [PATCH 2/4] fixup! [mlir][linalg] Add e2e test for linalg.mmt4d

---
 .../Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir          | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
index bf7a98fc07f086..9769c686861b16 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
@@ -1,6 +1,6 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
-// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -test-lower-to-llvm -o %t -o %t
 // DEFINE: %{entry_point} = main
 // DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
@@ -9,13 +9,13 @@
 
 // RUN: %{run} | FileCheck %s
 
-/// End-to-end test for computing matrix-multiplicatin using linalg.mmt4d. In
+/// End-to-end test for computing matrix-multiplication using linalg.mmt4d. In
 /// particular, demonstrates how the following MLIR sequence (implemented in @mmt4d):
 ///
 ///   A_pack = tensor.pack A
 ///   B_pack = tensor.pack B
 ///   C_pack = tensor.pack C
-///   out_pack = linalg.mmt4d(A_pack, B_ pack, C_pack)
+///   out_pack = linalg.mmt4d(A_pack, B_pack, C_pack)
 ///
 /// is equivalent to:
 ///
@@ -55,7 +55,7 @@ func.func @main() {
   %xf = tensor.cast %C_mmt4d : tensor<7x13xi32> to tensor<*xi32>
   call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()
 
-  // Matrix multiplicaiton with linalg.matmul
+  // Matrix multiplication with linalg.matmul
   // CHECK: Unranked Memref
   // CHECK:  [193,   200,   207,   214,   221,   228,   235,   242,   249,   256,   263,   270,   277]
   // CHECK:  [194,   201,   208,   215,   222,   229,   236,   243,   250,   257,   264,   271,   278]
@@ -87,7 +87,7 @@ func.func @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi3
   %C_pack_empty = tensor.empty() : tensor<2x2x8x8xi32>
 
   // Pack matrices
-  %A_pack = tensor.pack %A padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32>
+  %A_pack = tensor.pack %A padding_value(%zero : i32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32>
   %B_pack = tensor.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32>
   %C_pack = tensor.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32>
 

>From ded93ef73dff25f13dac5d7ddd3f246c8b4036ad Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 27 Mar 2024 20:20:09 +0000
Subject: [PATCH 3/4] fixup! fixup! [mlir][linalg] Add e2e test for
 linalg.mmt4d

Address PR comments
---
 .../Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir          | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
index 9769c686861b16..d06ba7ac664046 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
@@ -1,13 +1,12 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
-// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -test-lower-to-llvm -o %t -o %t
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" \
+// DEFINE:    -buffer-deallocation-pipeline  -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm
 // DEFINE: %{entry_point} = main
-// DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
+// DEFINE: %{run} = mlir-cpu-runner -e %{entry_point} -entry-point-result=void \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
 
-// RUN: %{compile}
-
-// RUN: %{run} | FileCheck %s
+// RUN: %{compile} | %{run} | FileCheck %s
 
 /// End-to-end test for computing matrix-multiplication using linalg.mmt4d. In
 /// particular, demonstrates how the following MLIR sequence (implemented in @mmt4d):
@@ -81,7 +80,6 @@ func.func @matmul(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi
 func.func @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
   %zero = arith.constant 0 : i32
 
-  %cst = arith.constant 0 : i32
   %A_pack_empty = tensor.empty() : tensor<2x16x8x1xi32>
   %B_pack_empty = tensor.empty() : tensor<2x16x8x1xi32>
   %C_pack_empty = tensor.empty() : tensor<2x2x8x8xi32>

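As of this patch the test no longer round-trips through %t: %{compile}
prints the lowered module to stdout and mlir-cpu-runner reads it from
stdin, so the single RUN line roughly expands to (with %mlir_runner_utils
and %mlir_c_runner_utils being the usual lit substitutions for the
runtime libraries):

  mlir-opt pack-unpack-mmt4d.mlir \
      -transform-interpreter -test-transform-dialect-erase-schedule \
      -one-shot-bufferize="bufferize-function-boundaries" \
      -buffer-deallocation-pipeline -cse -canonicalize \
      -convert-vector-to-scf -test-lower-to-llvm \
    | mlir-cpu-runner -e main -entry-point-result=void \
        -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
    | FileCheck pack-unpack-mmt4d.mlir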
>From fd75b32721876fccccc48874f6bd9ea6842ea683 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Thu, 28 Mar 2024 09:25:05 +0000
Subject: [PATCH 4/4] fixup! fixup! fixup! [mlir][linalg] Add e2e test for
 linalg.mmt4d

Add "private-function-dynamic-ownership" to
"-buffer-deallocation-pipeline". This prevents the bufferizer from
inserting `bufferization.clone` ops (which are a bit tricky to remove) while
still avoiding leaks.
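
For context, without dynamic ownership the deallocation pass makes each
function own the buffers it returns, which can materialize copies along
the lines of (a sketch, not IR taken from this test):

  %0 = bufferization.clone %ret : memref<7x13xi32> to memref<7x13xi32>

With "private-function-dynamic-ownership" the ownership bit is instead
threaded through calls to private functions, so the clone (and the copy
it implies) is not needed.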
---
 .../Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir  | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
index d06ba7ac664046..5680882dccb1ee 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir
@@ -1,7 +1,8 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
 // DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" \
-// DEFINE:    -buffer-deallocation-pipeline  -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm
+// DEFINE:    -buffer-deallocation-pipeline="private-function-dynamic-ownership" \
+// DEFINE:    -cse -canonicalize -test-lower-to-llvm
 // DEFINE: %{entry_point} = main
 // DEFINE: %{run} = mlir-cpu-runner -e %{entry_point} -entry-point-result=void \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
@@ -70,14 +71,14 @@ func.func @main() {
   return
 }
 
-func.func @matmul(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
+func.func private @matmul(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
   %C_matmul = linalg.matmul ins(%A, %B: tensor<7x16xi32>, tensor<16x13xi32>) 
                             outs(%C: tensor<7x13xi32>) -> tensor<7x13xi32>
 
   return %C_matmul : tensor<7x13xi32>
 }
 
-func.func @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
+func.func private @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor<7x13xi32>) -> tensor<7x13xi32> {
   %zero = arith.constant 0 : i32
 
   %A_pack_empty = tensor.empty() : tensor<2x16x8x1xi32>


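Note that making @matmul and @mmt4d private (above) is what lets the new
option apply: as its name suggests, "private-function-dynamic-ownership"
only changes the ownership convention of private functions, so the entry
point @main keeps the default convention.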
