[Mlir-commits] [mlir] [mlir][ArmSME] Add comments in tile-spills-and-fills.mlir (PR #91450)
Andrzej WarzyĆski
llvmlistbot at llvm.org
Wed May 8 08:18:02 PDT 2024
https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/91450
>From 3ee7ea28f9a57a1eefe831c8a207a7d27dcb6ee0 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 8 May 2024 10:02:28 +0100
Subject: [PATCH 1/2] [mlir][ArmSME] Add comments in tile-spills-and-fills.mlir
* adds comments in tile-spills-and-fills.mlir
* adds comments in ArmSMEIntrinsicOps.td
* updates test in tile-spills-and-fills.mlir not to return 2D scalable
vectors (e.g. vector<[4]x[4]xf32>) - that's not supported and not
needed for that test
---
.../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 4 +-
.../ArmSMEToLLVM/tile-spills-and-fills.mlir | 42 ++++++++++++++++---
2 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
index f051e03efbcda..0e38325f9891a 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td
@@ -115,7 +115,7 @@ class ArmSME_IntrLoadStoreOp<string mnemonic>
/*immArgPositions=*/[2],
/*immArgAttrNames=*/["tile_id"]>;
-// Loads
+// Loads (from memory to ZA tile slice)
class ArmSME_IntrLoadOp<string mnemonic>
: ArmSME_IntrLoadStoreOp<mnemonic>,
Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
@@ -134,7 +134,7 @@ def LLVM_aarch64_sme_ld1w_vert : ArmSME_IntrLoadOp<"ld1w.vert">;
def LLVM_aarch64_sme_ld1d_vert : ArmSME_IntrLoadOp<"ld1d.vert">;
def LLVM_aarch64_sme_ld1q_vert : ArmSME_IntrLoadOp<"ld1q.vert">;
-// Stores
+// Stores (ZA tile slice to memory)
class ArmSME_IntrStoreOp<string mnemonic>
: ArmSME_IntrLoadStoreOp<mnemonic>,
Arguments<(ins Arg<SVEPredicate, "Vector predicate">:$predicate,
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
index 7a9e6b4215754..ece6e6d2e7c12 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
@@ -76,13 +76,23 @@ func.func @use_too_many_tiles() {
// AFTER-LLVM-LOWERING-SAME: {arm_sme.in_memory_tile_id = 16 : i32} : memref<?x?xi16>
//
// AFTER-LLVM-LOWERING-NOT: scf.for
-// Note: 17 is the mask for the 32-bit tile 0.
+
+/// 1. Create/allocate %0
+/// Note: 17 is the mask for the 32-bit tile 0.
+
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 17 : i32}>
//
// AFTER-LLVM-LOWERING-NOT: scf.for
-// Note: 34 is the mask for the 32-bit tile 1.
+
+/// 2. Create/allocate %1
+/// Note: 34 is the mask for the 32-bit tile 1.
+
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 34 : i32}>
-//
+
+/// 3. Spill %0 (the 32-bit tile 0), so that %2 can be allocated (16 bit
+/// tile 0). Note that this is spilling vector<[8]x[8]xi16> rather than
+/// vector<[4]x[4]xi32>
+
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
// AFTER-LLVM-LOWERING: %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
@@ -92,8 +102,14 @@ func.func @use_too_many_tiles() {
// AFTER-LLVM-LOWERING-NEXT: "arm_sme.intr.ld1h.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
// AFTER-LLVM-LOWERING-NEXT: vector.store %[[SLICE]], %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING-NEXT: }
-// Note: 85 is the mask for the 16-bit tile 0.
+
+/// 4. Create/allocate %2
+/// Note: 85 is the mask for the 16-bit tile 0.
+
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 85 : i32}>
+
+/// 5. Re-load %0
+
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
// AFTER-LLVM-LOWERING: %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
@@ -116,7 +132,7 @@ func.func @very_excessive_spills(%memref : memref<?x?xf32>) -> vector<[4]x[4]xf3
%tile = arm_sme.get_tile : vector<[4]x[4]xf32>
%mask = vector.constant_mask [4] : vector<[4]xi1>
%loadSlice = arm_sme.load_tile_slice %memref[%c0, %c0], %mask, %tile, %c0 : memref<?x?xf32>, vector<[4]xi1>, vector<[4]x[4]xf32>
- return %loadSlice : vector<[4]x[4]xf32>
+ "test.some_use"(%loadSlice) : (vector<[4]x[4]xf32>) -> ()
}
// AFTER-TILE-ALLOC-LABEL: @very_excessive_spills
// AFTER-TILE-ALLOC: arm_sme.get_tile
@@ -133,22 +149,38 @@ func.func @very_excessive_spills(%memref : memref<?x?xf32>) -> vector<[4]x[4]xf3
// AFTER-LLVM-LOWERING-DAG: %[[TILE_ALLOCA:.*]] = memref.alloca(%[[SVL_S]], %[[SVL_S]])
// AFTER-LLVM-LOWERING-SAME: {arm_sme.in_memory_tile_id = 16 : i32} : memref<?x?xf32>
//
+
+/// 1. Swap %useAllTiles and %tile - note that this will only swap one 32-bit
+/// tile (vector<[4]x[4]xf32>)
+
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_S]] step %[[C1]] {
// AFTER-LLVM-LOWERING: %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING: %[[BASE_PTR:.*]] = llvm.extractvalue %[[MEM_DESC]][1]
// AFTER-LLVM-LOWERING: %[[SLICE_PTR:.*]] = llvm.getelementptr %[[BASE_PTR]]
+// Read ZA tile slice -> vector
// AFTER-LLVM-LOWERING: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"{{.*}} <{tile_id = 0 : i32}>
+/// Load vector from memory -> ZA tile
// AFTER-LLVM-LOWERING-NEXT: "arm_sme.intr.ld1w.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
+/// Store ZA tile slice in memory
// AFTER-LLVM-LOWERING-NEXT: vector.store %[[SLICE]], %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING-NEXT: }
+
+/// 2. Load into %tile
// AFTER-LLVM-LOWERING: "arm_sme.intr.ld1w.horiz"{{.*}} <{tile_id = 0 : i32}>
+
+/// 3. Swap %useAllTiles and %tile - note that this will only swap one 32-bit
+/// tile (vector<[4]x[4]xf32>)
+
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_S]] step %[[C1]] {
// AFTER-LLVM-LOWERING: %[[MEM_DESC:.*]] = builtin.unrealized_conversion_cast %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING: %[[BASE_PTR:.*]] = llvm.extractvalue %[[MEM_DESC]][1]
// AFTER-LLVM-LOWERING: %[[SLICE_PTR:.*]] = llvm.getelementptr %[[BASE_PTR]]
+/// Read ZA tile slice -> vector
// AFTER-LLVM-LOWERING: %[[SLICE:.*]] = "arm_sme.intr.read.horiz"{{.*}} <{tile_id = 0 : i32}>
+/// Load vector from memory -> ZA tile
// AFTER-LLVM-LOWERING-NEXT: "arm_sme.intr.ld1w.horiz"({{.*}}, %[[SLICE_PTR]], {{.*}}) <{tile_id = 0 : i32}>
+/// Store ZA tile slice in memory
// AFTER-LLVM-LOWERING-NEXT: vector.store %[[SLICE]], %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING-NEXT: }
>From f009f5acae970b9f8cb92dcd2b575870e381e5ed Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Wed, 8 May 2024 16:17:37 +0100
Subject: [PATCH 2/2] fixup! [mlir][ArmSME] Add comments in
tile-spills-and-fills.mlir
Update comments
---
.../ArmSMEToLLVM/tile-spills-and-fills.mlir | 20 ++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
index ece6e6d2e7c12..bbe84c19d9c6e 100644
--- a/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
+++ b/mlir/test/Conversion/ArmSMEToLLVM/tile-spills-and-fills.mlir
@@ -72,26 +72,31 @@ func.func @use_too_many_tiles() {
// AFTER-LLVM-LOWERING-DAG: %[[C8:.*]] = arith.constant 8 : index
// AFTER-LLVM-LOWERING-DAG: %[[VSCALE:.*]] = vector.vscale
// AFTER-LLVM-LOWERING-DAG: %[[SVL_H:.*]] = arith.muli %[[VSCALE]], %[[C8]] : index
+
+/// 0. Create an in-memory-tile
+/// Note: 16 is the mask for the first in-memory tile
+
// AFTER-LLVM-LOWERING-DAG: %[[TILE_ALLOCA:.*]] = memref.alloca(%[[SVL_H]], %[[SVL_H]])
// AFTER-LLVM-LOWERING-SAME: {arm_sme.in_memory_tile_id = 16 : i32} : memref<?x?xi16>
//
// AFTER-LLVM-LOWERING-NOT: scf.for
-/// 1. Create/allocate %0
+/// 1. The following instruciton corresponds to %0 after tile allocation
/// Note: 17 is the mask for the 32-bit tile 0.
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 17 : i32}>
//
// AFTER-LLVM-LOWERING-NOT: scf.for
-/// 2. Create/allocate %1
+/// 2. The following instruciton corresponds to %1 after tile allocation
/// Note: 34 is the mask for the 32-bit tile 1.
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 34 : i32}>
-/// 3. Spill %0 (the 32-bit tile 0), so that %2 can be allocated (16 bit
-/// tile 0). Note that this is spilling vector<[8]x[8]xi16> rather than
-/// vector<[4]x[4]xi32>
+/// 3. swap(<in-memory-tile>, tile 0).
+/// This can be interpreted as spilling %0 (the 32-bit tile 0), so that
+/// %2 can be allocated a tile (16 bit tile 0). Note that this is
+/// swapping vector<[8]x[8]xi16> rather than vector<[4]x[4]xi32>.
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
@@ -103,12 +108,13 @@ func.func @use_too_many_tiles() {
// AFTER-LLVM-LOWERING-NEXT: vector.store %[[SLICE]], %[[TILE_ALLOCA]]
// AFTER-LLVM-LOWERING-NEXT: }
-/// 4. Create/allocate %2
+/// 4. The following instruciton corresponds to %3 after tile allocation
/// Note: 85 is the mask for the 16-bit tile 0.
// AFTER-LLVM-LOWERING: "arm_sme.intr.zero"() <{tile_mask = 85 : i32}>
-/// 5. Re-load %0
+/// 5. swap(<inMemoryTile>, tile 0)
+/// This can be interpreted as restoring %0.
// AFTER-LLVM-LOWERING: scf.for
// AFTER-LLVM-LOWERING-SAME: %[[C0]] to %[[SVL_H]] step %[[C1]] {
More information about the Mlir-commits
mailing list