[Mlir-commits] [mlir] e5ce030 - [mlir][xegpu] Improve XeGPU op verification logic for SIMT flavor and update tests. (#127920)
llvmlistbot at llvm.org
Mon Feb 24 13:50:10 PST 2025
Author: Charitha Saumya
Date: 2025-02-24T22:50:06+01:00
New Revision: e5ce0304335dc1cae6856c880d1d4e14dcf8265d
URL: https://github.com/llvm/llvm-project/commit/e5ce0304335dc1cae6856c880d1d4e14dcf8265d
DIFF: https://github.com/llvm/llvm-project/commit/e5ce0304335dc1cae6856c880d1d4e14dcf8265d.diff
LOG: [mlir][xegpu] Improve XeGPU op verification logic for SIMT flavor and update tests. (#127920)
This PR adds the required changes for XeGPU ops to support SIMT
distribution.
1. Adds verification logic for the SIMT flavor of load_nd, store_nd, dpas,
load_gather and store_scatter ops.
2. Adds test cases to cover the SIMT version of these ops along with
their VC counterparts.
---------
Co-authored-by: Artem Kroviakov <71938912+akroviakov at users.noreply.github.com>
Added:
mlir/test/Dialect/XeGPU/ops.mlir
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
mlir/test/Dialect/XeGPU/invalid.mlir
Removed:
mlir/test/Dialect/XeGPU/XeGPUOps.mlir
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 78dfaef97420b..56b836d707a7d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -80,6 +80,9 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.
+ In SIMT mode, the tensor descriptor is augmented with `SGMapAttr`, which describes
+ the mapping of the tensor descriptor to the work items.
+
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
```mlir
%0 = memref.alloc() : memref<1024x1024xf32>
@@ -103,6 +106,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c1 = arith.constant 1 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
```
+
+ Example 4 (SIMT mode):
+ ```mlir
+ %0 = memref.alloc() : memref<1024x1024xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 8 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ ```
}];
let arguments = (ins
@@ -294,7 +306,11 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
fp32 or fp64. It implies that vnni and transpose cannot exist at the
same time.
- Example:
+ In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ which describes the mapping of the tensor to the work items. In this case, the
+ result vector represents the data to be loaded by each work-item.
+
+ Example 1:
```mlir
xegpu.load_nd %1 {transpose = [1, 0],
l1_hint = #xegpu.cache_hint<cached>,
@@ -302,6 +318,13 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
l3_hint = #xegpu.cache_hint<streaming>}
: !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
```
+ Example 2 (SIMT mode):
+ ```mlir
+ xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<8x16xf32,
+ #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ ```
}];
@@ -341,13 +364,25 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
the corresponding cache hint attribute will be masked.
- Example:
+ In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ which describes the mapping of the tensor to the work items. In this case, the
+ input vector represents the data to be stored by each work-item.
+
+ Example 1:
```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
```
+ Example 2 (SIMT mode):
+ ```mlir
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
+ #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ ```
}];
@@ -380,10 +415,15 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
The offsets are relative offset to the current position in the number
of elements. It will result in a same type TensorDesc as the input.
- example:
+ Example 1:
```
%2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
```
+ Example 2 (SIMT mode):
+ ```
+ %2 = xegpu.update_nd_offset %1, [0, 16]:
+ !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ ```
}];
let arguments = (ins
@@ -441,14 +481,19 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
match the dimension of offsets. It may also have a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.
- Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+ In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
+ with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+ In this case, the first dimension of the tensor descriptor represents the work-items, and
+ the second dimension represents the chunk size.
+
+ Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
```mlir
%a = memref.alloc() : memref<1024xf32>
%0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
```
- Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
+ Example 2: It assumes subgroup size is 4, and each workitem accesses 8 elements.
It will access a total of 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
```mlir
%0 = memref.alloc() : memref<1024xf32>
@@ -457,7 +502,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
```
- Example 3. It is similar to Example 2, but there is some overlaps among workitems.
+ Example 3: It is similar to Example 2, but there are some overlaps among workitems.
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
```mlir
%0 = memref.alloc() : memref<1024xf32>
@@ -465,6 +510,15 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
```
+
+ Example 4: SIMT mode
+ ```mlir
+ %0 = memref.alloc() : memref<1024xf32>
+ %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+ %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+ -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
+ #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ ```
}];
let arguments = (ins XeGPU_BaseAddrType: $source,
@@ -569,6 +623,11 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+ In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ which describes the mapping of the tensor to the work items. In this case, the result
+ vector represents the data to be loaded by each work-item. Each work-item receives a `chunk_size`
+ number of elements.
+
Example 1:
```mlir
%2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
@@ -587,6 +646,16 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
vector<16xi1> -> vector<8x16xf32>
```
+ Example 3 (SIMT mode):
+ ```mlir
+ %2 = xegpu.load %1, %0 {transpose,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
+ #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>,
+ vector<16xi1> -> vector<8x1xf32>
+ ```
}];
@@ -608,8 +677,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
return getElementTypeOrSelf(type);
}
- Type getValueType() {
- return getValue().getType();
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
}
Type getMaskType() {
@@ -635,9 +704,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
+ In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ which describes the mapping of the tensor to the work items. In this case, the input
+ vector represents the data to be stored by each work-item. Each work-item receives a `chunk_size`
+ number of elements.
+
Example 1:
```mlir
- %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
@@ -645,12 +719,21 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
Example 2:
```mlir
- %3 = xegpu.store %0, %1, %2 {transpose,
+ xegpu.store %0, %1, %2 {transpose,
l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
```
+ Example 3 (SIMT mode):
+ ```mlir
+ xegpu.store %0, %1, %2 {transpose,
+ l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
+ #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>, vector<16xi1>
+ ```
}];
@@ -668,8 +751,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
return getTensorDesc().getType();
}
- Type getValueType() {
- return getValue().getType();
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
}
Type getMaskType() {
@@ -695,11 +778,19 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
update the offset per work-item, so its offsets contain values representing
shifts for each work-item.
- Example:
+ Example 1:
```mlir
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
- !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<>>, vector<4xindex>
+ !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>>, vector<4xindex>
+ ```
+
+ Example 2 (SIMT mode):
+ ```mlir
+ %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+ %2 = xegpu.update_offset %1, %off :
+ !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
+ #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
```
}];
@@ -749,6 +840,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.
+ In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
+ which describe the data fragment owned by each work-item w.r.t. the tensor
+ descriptor these data are loaded from.
+
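    A minimal SIMT sketch (not part of this diff; the attribute syntax is assumed
    from the sg_map examples elsewhere in this patch). Each of the 16 work items
    holds an 8x1 fragment of A, a packed 8x2 fragment of B, and an 8x1 fragment of C:
    ```mlir
    // Expanded by wi_layout = [1, 16]: A -> 8x16, B -> 16x16, C -> 8x16,
    // so the M/N/K dimension checks in the verifier are satisfied.
    %d = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
                            sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
                            sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
         : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
    ```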
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
the first dimension and inserting them into the newly added innermost dimension.
@@ -757,7 +852,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
let arguments = (ins
XeGPU_DpasOpType : $lhs,
XeGPU_DpasOpType : $rhs,
- Optional<XeGPU_Vector2DType>: $acc);
+ Optional<XeGPU_Vector2DType>: $acc,
+ OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
+ OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
+ OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
let results = (outs XeGPU_Vector2DType: $result);
let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index cc2e93fb19a70..ccd91a928e1dd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -103,7 +103,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
];
-
+
let extraClassDeclaration = [{
using TensorType::clone;
using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
@@ -176,6 +176,11 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return scatter_attr.getChunkSize().getInt();
return 1;
}
+
+ // This returns a vector type that represents the fragment of data owned by
+ // a work item in SIMT mode if this tensor descriptor is used in a XeGPU
+ // load/store operation.
+ FailureOr<VectorType> getDistributedVectorType();
}];
let hasCustomAssemblyFormat = true;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 06fd03f3af3ad..e3127c66a1db3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,8 +8,13 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/LogicalResult.h"
+#include <cassert>
namespace mlir {
namespace xegpu {
@@ -239,6 +244,8 @@ LogicalResult TensorDescType::verify(
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
mlir::Attribute encoding, mlir::Attribute sg_map) {
size_t rank = shape.size();
+ // Low-precision types are packed in 32-bit units.
+ unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
@@ -252,6 +259,16 @@ LogicalResult TensorDescType::verify(
return emitError() << "expected non-contiguous elements for 1D tensor";
if (rank == 2 && chunkSize < 2)
return emitError() << "expected chunk blocks for 2D tensor";
+ // If chunk size > 1, the second dimension of the tensor shape must be
+ // equal to chunk size and it must be a multiple of the packing factor.
+ if (chunkSize > 1) {
+ if (shape.back() != chunkSize)
+ return emitError() << "expected tensor shape[1] to match chunk size";
+ if (shape.back() % packingFactor != 0)
+ return emitError()
+ << "expected tensor shape[1] to be a multiple of packing factor "
+ << packingFactor;
+ }
}
if (auto blockAttr =
@@ -276,14 +293,13 @@ LogicalResult TensorDescType::verify(
if (scatterAttr) {
// Validate subgroup mapping rules for scattered tensors.
// A work-item's slice of the tensor with shape [sg_size] or
- // [sg_size, chunk_size] will be [1] or [1, chunks_size] respectively,
- // the mapping should reflect that.
+ // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
+ // respectively, the mapping should reflect that. This is because each
+ // work item accesses data at 32-bit granularity.
if (wiData[0] != 1)
return emitError()
<< "cannot map over non-contiguous scattered row elements";
-
- unsigned chunkSize = scatterAttr.getChunkSize().getInt();
- if (wiData[1] != chunkSize)
+ if (wiData[1] != packingFactor)
return emitError() << "work item data mapping must match the number of "
"contiguous elements";
}
@@ -307,6 +323,84 @@ LogicalResult TensorDescType::verify(
return success();
}
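A minimal sketch of a descriptor that satisfies the new chunk-size and packing
rules above (not part of this diff), assuming f16 elements and hence a packing
factor of 32/16 = 2:
```mlir
// shape[1] = 8 equals chunk_size and is a multiple of the packing factor (2);
// wi_data = [1, 2] matches the packing factor, as required for scattered tensors.
!xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>,
                   #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
```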
+// If a tensor descriptor has an sg_map attribute, it is used in SIMT mode.
+// In this mode, the distributed vector shape is determined as follows:
+// Definitions:
+// wi_data_size = wi_data[0] × wi_data[1]
+// subgroup_size = wi_layout[0] × wi_layout[1]
+// distribution_unit_size = subgroup_size × wi_data_size
+// ---------------------------------------------------------------------
+// Case 1: Regular loads/stores.
+// ---------------------------------------------------------------------
+// Distributed vector shape must be:
+// [chunk_size / wi_data_size, wi_data_size]
+// If the tensor descriptor shape is 1D, the first dimension is dropped:
+// [wi_data_size]
+// ---------------------------------------------------------------------
+// Case 2: Block loads/stores
+// ---------------------------------------------------------------------
+// Additional definitions:
+// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
+// n_distribution_units = tensor_size / distribution_unit_size
+// Given above definitions, the following conditions must be met:
+// * tensor_desc[0] % (wi_layout[0] × wi_data[0]) == 0
+// * tensor_desc[1] % (wi_layout[1] × wi_data[1]) == 0
+// Distributed vector shape must be:
+// [n_distribution_units, wi_data_size]
+FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
+ auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+ // If no sg_map is provided, tensor desc is not used in SIMT mode.
+ if (!sgMap)
+ return failure();
+
+ SmallVector<int64_t> wiData(sgMap.getWiData());
+ SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+ auto tdescShape = getShape();
+
+ auto wiDataSize = 1, sgSize = 1;
+ for (auto [wiDim, wiDataDim] : llvm::zip_equal(wiLayout, wiData)) {
+ wiDataSize *= wiDataDim;
+ sgSize *= wiDim;
+ }
+
+ // Case 1: regular loads/stores
+ auto scatterAttr = getEncodingAsScatterTensorDescAttr();
+ if (scatterAttr) {
+ auto chunkSize = scatterAttr.getChunkSize().getInt();
+ // Verify if the first dimension of the tensor descriptor shape is
+ // distributable.
+ assert(tdescShape[0] % (wiLayout[0]) == 0 &&
+ "tensor descriptor shape is not distributable");
+ if (chunkSize > 1)
+ return VectorType::get({chunkSize / wiDataSize, wiDataSize},
+ getElementType());
+ return VectorType::get({wiDataSize}, getElementType());
+ }
+
+ // Case 2: block loads/stores
+ // Tensor descriptor shape can be 1D. For the 1D case, outer dims of wiData
+ // and wiLayout must be 1.
+ if (tdescShape.size() == 1) {
+ assert((wiData[0] == 1 && wiLayout[0] == 1) &&
+ "wi_data[0] and wi_layout[0] must be 1 for 1D tensor descriptor");
+ wiData = {wiData[1]};
+ wiLayout = {wiLayout[1]};
+ }
+ // Check if the tensor descriptor shape is distributable.
+ int64_t tensorSize = 1;
+ for (auto [tdescDim, wiDim, wiDataDim] :
+ llvm::zip_equal(tdescShape, wiLayout, wiData)) {
+ assert((tdescDim % (wiDim * wiDataDim) == 0) &&
+ "tensor descriptor shape is not distributable");
+ tensorSize *= tdescDim;
+ }
+ // tensorSize must be adjusted for array_length.
+ tensorSize *= getArrayLength();
+
+ return VectorType::get({tensorSize / (sgSize * wiDataSize), wiDataSize},
+ getElementType());
+}
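Working the block-load formulas above for the 8x16xf32 descriptor used throughout
this patch (a sketch, not part of this diff):
```mlir
// wi_data_size = 1*1 = 1, subgroup_size = 1*16 = 16
// tensor_size = 8*16 = 128, n_distribution_units = 128 / (16*1) = 8
// => getDistributedVectorType() returns vector<8x1xf32>
!xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
```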
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 25dc1f22f0432..fc9498a886aa3 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -10,9 +10,13 @@
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Support/LLVM.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
#define DEBUG_TYPE "xegpu"
@@ -73,41 +77,36 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
}
-// Validations for nd instruction arguments is successful if any of these are
-// true:
-// - tensor descriptor and the output vector shapes exactly match.
-// - tensor descriptor has a sg_map attribute and the distributed vector shape
-// matches the tensor descriptor shape when scaled using sg_map factors on
-// each dimension.
-static bool isArgShapesValid(ArrayRef<int64_t> descShape,
- ArrayRef<int64_t> valShape, SGMapAttr sgMap) {
- // Equal shapes with no distribution - no further verification needed.
- if (descShape == valShape && !sgMap)
- return true;
-
- // Unknown distribution - cannot perform operation on partial shape.
+// Helper to validate value shape of LoadNd and StoreNd ops.
+static LogicalResult
+isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
+ ArrayRef<int64_t> adjustedTdescShape,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto sgMap = tdescTy.getSGMapAttr();
+ auto valueShape = valueTy.getShape();
+ // sg_map not present means the IR is in SIMD mode. In this case, the value
+ // shape must match the adjusted tensor descriptor shape.
if (!sgMap)
- return false;
-
- // Invalid rank or mixed rank usage.
- size_t descRank = descShape.size();
- if (descRank > 2 || valShape.size() != descRank)
- return false;
-
- // For 1D, SG map is guaranteed to be unit size in the outer dimension.
- // Only take the distribution over the innermost dimension for validation.
- ArrayRef<uint32_t> wiLayout = sgMap.getWiLayout();
- SmallVector<uint32_t> mapLayout(wiLayout.begin(), wiLayout.end());
- if (descRank == 1)
- mapLayout = {wiLayout.back()};
-
- for (const auto &[factor, dim, expected] :
- llvm::zip_equal(mapLayout, valShape, descShape)) {
- if (factor * dim != expected)
- return false;
- }
-
- return true;
+ return valueShape == adjustedTdescShape
+ ? success()
+ : emitError()
+ << "Value shape " << makeString(valueShape)
+ << " is not consistent with tensor descriptor " << tdescTy;
+
+ // sg_map present means the IR is in SIMT mode. In this case, sg_map determines
+ // the value shape.
+ auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
+ assert(succeeded(expectedValueShapeOrFailure) &&
+ "Failed to compute distributed vector shape for "
+ "tensor descriptor ");
+
+ return valueTy == expectedValueShapeOrFailure.value()
+ ? success()
+ : emitError()
+ << "Result shape " << makeString(valueShape)
+ << " is not consistent with distributed vector shape "
+ << makeString(expectedValueShapeOrFailure.value().getShape())
+ << " for tensor descriptor " << tdescTy;
}
//===----------------------------------------------------------------------===//
@@ -280,7 +279,8 @@ LogicalResult LoadNdOp::verify() {
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
auto array_len = tdescTy.getArrayLength();
- auto tdescShape = getShapeOf(tdescTy);
+ // The adjusted tensor descriptor shape tracks the expected shape of the result.
+ auto adjustedTdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
if (getTranspose()) {
@@ -292,7 +292,7 @@ LogicalResult LoadNdOp::verify() {
});
if (valid)
- transpose(trans, tdescShape);
+ transpose(trans, adjustedTdescShape);
else
mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";
}
@@ -301,8 +301,8 @@ LogicalResult LoadNdOp::verify() {
if (tdescTy.getRank() == 2) {
const int axis = 0;
auto vnni_factor = valueShape.back();
- tdescShape[axis] /= vnni_factor;
- tdescShape.push_back(vnni_factor);
+ adjustedTdescShape[axis] /= vnni_factor;
+ adjustedTdescShape.push_back(vnni_factor);
} else {
mlir::emitWarning(getLoc())
<< "Invalid Packed Attr. It is ignored (available for 2D "
@@ -311,17 +311,12 @@ LogicalResult LoadNdOp::verify() {
}
if (array_len > 1) {
- auto it = tdescShape.begin();
- tdescShape.insert(it, array_len);
+ auto it = adjustedTdescShape.begin();
+ adjustedTdescShape.insert(it, array_len);
}
- auto sgMap = tdescTy.getSGMapAttr();
- if (!isArgShapesValid(tdescShape, valueShape, sgMap))
- return emitOpError() << "Result shape doesn't match TensorDesc shape."
- << "The expected shape is " << makeString(tdescShape)
- << ". But the given shape is "
- << makeString(valueShape) << ".\n";
- return success();
+ return isArgShapesValid(tdescTy, valueTy, adjustedTdescShape,
+ [&]() { return emitOpError(); });
}
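To illustrate the adjustment (a sketch, not part of this diff; %src is assumed to
be a memref<8x16xf16>): with the packed attribute and a vnni factor of 2, the
8x16 descriptor shape becomes 4x16x2 before it is compared against the result type:
```mlir
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
// adjustedTdescShape: [8, 16] -> [8/2, 16, 2] = [4, 16, 2]
%1 = xegpu.load_nd %0 <{packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
```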
//===----------------------------------------------------------------------===//
@@ -351,14 +346,9 @@ LogicalResult StoreNdOp::verify() {
auto tdescShape = getShapeOf(dstTy);
auto valueShape = getShapeOf(valTy);
- auto sgMap = dstTy.getSGMapAttr();
- if (!isArgShapesValid(tdescShape, valueShape, sgMap))
- return emitOpError() << "Result shape doesn't match TensorDesc shape."
- << "The expected shape is " << makeString(tdescShape)
- << ". But the given shape is "
- << makeString(valueShape) << ".\n";
- return success();
+ return isArgShapesValid(dstTy, valTy, tdescShape,
+ [&]() { return emitOpError(); });
}
//===----------------------------------------------------------------------===//
@@ -478,6 +468,9 @@ LogicalResult LoadGatherOp::verify() {
auto maskTy = getMaskType();
auto valueTy = getValueType();
+ if (!valueTy)
+ return emitOpError("Expecting a vector type result.\n");
+
if (!tdescTy.isScattered())
return emitOpError("Expects a scattered TensorDesc.\n");
@@ -509,25 +502,8 @@ LogicalResult LoadGatherOp::verify() {
transpose({1, 0}, tdescShape);
}
- if (auto sgMap = tdescTy.getSGMapAttr()) {
- auto valueVecTy = cast<VectorType>(valueTy);
- const int32_t wiData =
- sgMap.getWiData()[0] > 1 ? sgMap.getWiData()[0] : sgMap.getWiData()[1];
- // All represent the same concept: a number of row elements to store.
- if (valueVecTy.getNumElements() != wiData ||
- valueVecTy.getNumElements() != tdescTy.getChunkSize()) {
- return emitOpError("Chunk size, vector size and wi_data must match.");
- }
- // Work-item's slice (i.e., vector shape to load) is [1] or [1, chunk_size].
- tdescShape[tdescTy.getRank() - 1] = 1;
- }
-
- if (valueShape != tdescShape)
- return emitOpError("Unexpected result shape")
- << "(Expected shape: " << makeString(tdescShape)
- << ", Given shape: " << makeString(valueShape) << ").\n";
-
- return success();
+ return isArgShapesValid(tdescTy, valueTy, tdescShape,
+ [&]() { return emitOpError(); });
}
//===----------------------------------------------------------------------===//
@@ -549,6 +525,10 @@ LogicalResult StoreScatterOp::verify() {
auto maskTy = getMaskType();
auto valueTy = getValueType();
+
+ if (!valueTy)
+ return emitOpError("Expecting a vector type for the value.\n");
+
auto maskShape = getShapeOf(maskTy);
auto tdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
@@ -561,25 +541,8 @@ LogicalResult StoreScatterOp::verify() {
transpose({1, 0}, tdescShape);
}
- if (auto sgMap = tdescTy.getSGMapAttr()) {
- auto valueVecTy = cast<VectorType>(valueTy);
- const int32_t wiData =
- sgMap.getWiData()[0] > 1 ? sgMap.getWiData()[0] : sgMap.getWiData()[1];
- // All represent the same concept: a number of row elements to store.
- if (valueVecTy.getNumElements() != wiData ||
- valueVecTy.getNumElements() != tdescTy.getChunkSize()) {
- return emitOpError("Chunk size, vector size and wi_data must match.");
- }
- // Work-item's slice (i.e., vector to store) is [1] or [1, chunk_size].
- tdescShape[tdescTy.getRank() - 1] = 1;
- }
-
- if (valueShape != tdescShape)
- return emitOpError("Unexpected value shape")
- << "(Expected shape: " << makeString(tdescShape)
- << ", Given shape: " << makeString(valueShape) << ").\n";
-
- return success();
+ return isArgShapesValid(tdescTy, valueTy, tdescShape,
+ [&]() { return emitOpError(); });
}
//===----------------------------------------------------------------------===//
@@ -610,20 +573,61 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
LogicalResult DpasOp::verify() {
int64_t lhsRank = getLhsType().getRank();
int64_t rhsRank = getRhsType().getRank();
-
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3))
- return emitOpError("expecting lhs to be a 2D vector, and rhs to be either "
- "2D or 3D (packed) vector.");
-
+ int64_t resultRank = getResultType().getRank();
auto lhsShape = getLhsType().getShape();
auto rhsShape = getRhsType().getShape();
- auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
- if (bK != lhsShape[1])
+ auto resultShape = getResultType().getShape();
+
+ auto sgMapA = getSgMapAAttr();
+ auto sgMapB = getSgMapBAttr();
+ auto sgMapC = getSgMapCAttr();
+
+ // If sg_maps are not present, then the operation is in SIMD mode.
+ if (!sgMapA && !sgMapB && !sgMapC) {
+ if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ return emitOpError(
+ "expecting lhs and result to be a 2D vector, and rhs to be either "
+ "2D or 3D (packed) vector.");
+ auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+ if (bK != lhsShape[1])
+ return emitOpError("K-dimension mismatch.");
+ if (lhsShape[0] != resultShape[0])
+ return emitOpError("M-dimension mismatch.");
+ if (rhsShape[1] != resultShape[1])
+ return emitOpError("N-dimension mismatch.");
+ return success();
+ }
+ // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
+ // result of DPAS operation.
+ if (!sgMapA || !sgMapB || !sgMapC)
+ return emitOpError("sg_map attributes for all operands and outputs are "
+ "expected in SIMT xegpu::Dpas operation");
+
+ // In SIMT mode, all data fragments must be 2D.
+ if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
+ return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ auto wiLayoutA = sgMapA.getWiLayout();
+ auto wiLayoutB = sgMapB.getWiLayout();
+ auto wiLayoutC = sgMapC.getWiLayout();
+ // Obtain the expanded shapes of the operands and result using wi_layout.
+ // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeB = {
+ rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
+ resultShape[1] * wiLayoutC[1]};
+ auto bK = expandedShapeB[0];
+ if (bK != expandedShapeA[1])
return emitOpError("K-dimension mismatch.");
+ if (expandedShapeA[0] != expandedShapeC[0])
+ return emitOpError("M-dimension mismatch.");
+ if (expandedShapeB[1] != expandedShapeC[1])
+ return emitOpError("N-dimension mismatch.");
return success();
}
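Conversely, a SIMT dpas the new verifier rejects (a sketch, not part of this diff;
all value names are hypothetical):
```mlir
// A expands to [8*1, 1*8] = 8x8 (K = 8), while B expands to
// [8*2*1, 1*16] = 16x16 (K = 16), so verification fails with
// "K-dimension mismatch."
%d = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>,
                        sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
                        sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
     : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
```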
-
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
deleted file mode 100644
index 472176af72b19..0000000000000
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ /dev/null
@@ -1,381 +0,0 @@
-// RUN: mlir-opt %s | FileCheck %s
-// Verify the printed output can be parsed.
-// RUN: mlir-opt %s | mlir-opt | FileCheck %s
-// Verify the generic form can be parsed.
-// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
-
-// CHECK-LABEL: gpu.module @test {
-gpu.module @test {
-// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_with_sg_map(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_with_sg_map(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
-gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
- //CHECK: %[[C:.*]] = arith.constant 1 : index
- %c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
-gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
- gpu.return
-}
-
-// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
- %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
- gpu.return
-}
-
-// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
- gpu.return
-}
-
-// load_nd args may have different shapes, validated against sg_map
-// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- gpu.return
-}
-
-// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2xf32>
- gpu.return
-}
-
-// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
- %1 = arith.constant dense<1.0>: vector<24x32xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
- gpu.return
-}
-
-// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
- %1 = arith.constant dense<1.0>: vector<32xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
- gpu.return
-}
-
-// store_nd args may have different shapes, validated against sg_map
-// CHECK: func @test_store_nd_vc_3(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc_3(%src: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x2xf16>
- %1 = arith.constant dense<1.0>: vector<24x2xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- gpu.return
-}
-
-// CHECK: func @test_store_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc_4(%src: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
- %1 = arith.constant dense<1.0>: vector<2xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_tdesc_vc_with_sg_map(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc_with_sg_map(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_load_with_sg_map(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_with_sg_map(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<2x1xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_load_with_sg_map_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_with_sg_map_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_store_with_sg_map(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_with_sg_map(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
- %2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @test_store_with_sg_map_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_with_sg_map_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
- %2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- gpu.return
-}
-
-
-
-// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_vc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @test_load_gather_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_gather_vc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<true> : vector<4xi1>
- %0 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
- //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2x4xf32>
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
- : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2x4xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_store_scatter_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_scatter_vc(%src: ui64) {
- //CHECK: %[[c0:.*]] = arith.constant dense<true> : vector<4xi1>
- %0 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32>
- %1 = arith.constant dense<2.9>: vector<2x4xf32>
- //CHECK: %[[c2:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %c = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c2]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %2 = xegpu.create_tdesc %src, %c : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
- //CHECK-SAME: vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>
- : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_vc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
- gpu.return
-}
-
-// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
-gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
- // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- gpu.return
-}
-
-
-// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
-gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
- // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
- %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
- //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
-gpu.func @alloc_nbarrier() {
- // CHECK: xegpu.alloc_nbarrier
- xegpu.alloc_nbarrier 8
- gpu.return
-}
-
-// CHECK: gpu.func @init_nbarrier({{.*}}) {
-gpu.func @init_nbarrier() {
- //CHECK: %[[c1:.*]] = arith.constant 1 : i8
- //CHECK: %[[c16:.*]] = arith.constant 16 : i8
- %nbarrier_id = arith.constant 1 : i8
- %threads_count = arith.constant 16 : i8
- //CHECK: xegpu.init_nbarrier %[[c1]], %[[c16]] : i8, i8 -> !xegpu.nbarrier
- %nbarrier = xegpu.init_nbarrier %nbarrier_id, %threads_count : i8, i8 -> !xegpu.nbarrier
- gpu.return
-}
-
-// CHECK: gpu.func @nbarrier_arrive(%[[arg0:.*]]: !xegpu.nbarrier) {
-gpu.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
- //CHECK: xegpu.nbarrier_arrive %[[arg0]] : !xegpu.nbarrier
- xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
- gpu.return
-}
-
-// CHECK: gpu.func @nbarrier_wait(%[[arg0:.*]]: !xegpu.nbarrier) {
-gpu.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
- //CHECK: xegpu.nbarrier_wait %[[arg0]] : !xegpu.nbarrier
- xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @fence({{.*}}) {
-gpu.func @fence() {
- //CHECK: xegpu.fence memory_kind = global, fence_scope = workgroup
- xegpu.fence memory_kind = global, fence_scope = workgroup
- gpu.return
-}
-
-}
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 86356e09de57c..88e9bbf78945b 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -78,10 +78,10 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
}
// -----
-func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
+func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
@@ -90,10 +90,10 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
}
// -----
-func.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
+func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
@@ -105,7 +105,7 @@ func.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
func.func @test_load_nd_vc_6(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32>
@@ -134,20 +134,20 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_vc_3(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
+func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
: vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_store_nd_vc_4(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
+func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
: vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
return
@@ -157,7 +157,7 @@ func.func @test_store_nd_vc_4(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32>
- // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}}
+ // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
xegpu.store_nd %data, %1 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
@@ -207,6 +207,25 @@ func.func @test_create_tdesc_vc_4(%src: memref<?xf32>) {
return
}
+// -----
+func.func @test_create_tdesc_vc_5(%src: memref<?xf32>) {
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
+ // expected-error@+1 {{expected tensor shape[1] to match chunk size}}
+ -> !xegpu.tensor_desc<4x5xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4>>
+ return
+}
+
+// -----
+func.func @test_create_tdesc_vc_6(%src: memref<?xf16>) {
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf16>, vector<4xindex>
+ // expected-error@+1 {{tensor shape[1] to be a multiple of packing factor 2}}
+ -> !xegpu.tensor_desc<4x3xf16, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
+ return
+}
+
+
// -----
func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
@@ -253,9 +272,9 @@ func.func @test_create_tdesc_sg_map_3(%src: ui64) {
func.func @test_load_gather_sg_map_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- // expected-error@+1 {{Unexpected result shape(Expected shape: [2, 1], Given shape: [1, 2])}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<1x2xf32>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ // expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
@@ -263,19 +282,9 @@ func.func @test_load_gather_sg_map_1(%src: ui64) {
func.func @test_load_gather_sg_map_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- // expected-error@+1 {{Unexpected result shape(Expected shape: [2, 1], Given shape: [2])}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<2xf32>
- return
-}
-
-// -----
-func.func @test_load_gather_sg_map_3(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- // expected-error@+1 {{Chunk size, vector size and wi_data must match}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<1xf32>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ // expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
@@ -285,9 +294,9 @@ func.func @test_store_scatter_sg_map_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- // expected-error@+1 {{Unexpected value shape(Expected shape: [2, 1], Given shape: [1, 2])}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ // expected-error@+1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
@@ -296,9 +305,9 @@ func.func @test_store_scatter_sg_map_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- // expected-error@+1 {{Unexpected value shape(Expected shape: [2, 1], Given shape: [2])}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ // expected-error@+1 {{Result shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
@@ -358,11 +367,49 @@ func.func @test_dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
// -----
func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
- // expected-error@+1 {{expecting lhs to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}}
+ // expected-error@+1 {{expecting lhs and result to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}}
%1 = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
return
}
+// -----
+func.func @test_dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
+ // expected-error@+1 {{K-dimension mismatch}}
+ %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ return
+}
+
+// -----
+func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) {
+ // expected-error@+1 {{M-dimension mismatch}}
+ %1 = xegpu.dpas %a, %b : vector<16x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ return
+}
+
+// -----
+func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
+ // expected-error@+1 {{N-dimension mismatch}}
+ %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32>
+ return
+}
+
+// -----
+func.func @test_dpas_sg_map_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+ // expected-error@+1 {{sg_map attributes for all operands and outputs are expected in SIMT xegpu::Dpas operation}}
+ %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ return
+}
+
+// -----
+func.func @test_dpas_sg_map_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
+ // expected-error@+1 {{K-dimension mismatch}}
+ %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
+ sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
+ sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
+ return
+}
+
// -----
func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
%0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
new file mode 100644
index 0000000000000..c32f1905454b6
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -0,0 +1,707 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ //CHECK: %[[C:.*]] = arith.constant 1 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ //CHECK: %[[C:.*]] = arith.constant 1 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
+gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
+gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
+gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+ %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
+ !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x16x16xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x8x16x2xf16>
+ %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x8x16x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ gpu.return
+}
+
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+ %1 = arith.constant dense<1.0>: vector<24x32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ gpu.return
+}
+
+// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
+ %1 = arith.constant dense<1.0>: vector<48x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
+ !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+
+
+// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
+ %1 = arith.constant dense<1.0>: vector<32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
+ gpu.return
+}
+
+
+// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
+ %1 = arith.constant dense<1.0>: vector<2x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
+ !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_simt(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_simt(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ gpu.return
+}
+
+
+// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
+gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_simt_1(%[[arg0:.*]]: memref<?xf32, 3>) {
+gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ gpu.return
+}
+
+
+// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref<?xf32>) {
+gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_simt_2(%[[arg0:.*]]: memref<?xf32>) {
+gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_vc_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_vc_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ gpu.return
+}
+
+
+// CHECK: gpu.func @test_create_tdesc_simt_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_simt_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2x4xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2x4xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_simt(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_simt(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_vc_2(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_vc_2(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_simt_2(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_simt_2(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_vc_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_vc_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<8x4xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<8x4xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_load_simt_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_simt_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_store_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32>
+ %2 = arith.constant dense<2.9>: vector<2x4xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
+ gpu.return
+}
+
+
+
+// CHECK: gpu.func @test_store_simt(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_simt(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
+ %2 = arith.constant dense<2.9>: vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_store_vc_2(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_vc_2(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2x4xf16>
+ %2 = arith.constant dense<2.9>: vector<2x4xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x4xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x4xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
+ gpu.return
+}
+
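+// SIMT f16 store: wi_data = [1, 2] keeps the two f16 chunk elements together, so each lane stores vector<1x2xf16>.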
+// CHECK: gpu.func @test_store_simt_2(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_simt_2(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
+ %2 = arith.constant dense<2.9>: vector<1x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ gpu.return
+}
+
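+// 1-D VC store: no chunk_size, so each of the 4 offsets addresses a single f32 and the payload is a flat vector<4xf32>.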
+// CHECK: gpu.func @test_store_vc_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_vc_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4xf32>
+ %2 = arith.constant dense<2.9>: vector<4xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
+ gpu.return
+}
+
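+// 1-D SIMT store: wi_layout = [1, 4] gives each of the 4 lanes a single f32 (vector<1xf32>).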
+// CHECK: gpu.func @test_store_simt_3(%[[arg0:.*]]: ui64) {
+gpu.func @test_store_simt_3(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
+ %2 = arith.constant dense<2.9>: vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ gpu.return
+}
+
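+// Prefetch only issues cache hints: it consumes the descriptor and produces no value, in both SIMT and VC flavors.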
+// CHECK: gpu.func @test_prefetch_simt(%[[arg0:.*]]: ui64) {
+gpu.func @test_prefetch_simt(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_prefetch_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ gpu.return
+}
+
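+// update_offset advances each per-work-item offset by the given step; the descriptor type (including any sg_map) is unchanged.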
+// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_update_tdesc_simt(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_update_tdesc_vc(%src: ui64) {
+ //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xindex>
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
+ gpu.return
+}
+
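+// VC dpas: the subgroup multiplies an 8x16 A tile by a 16x16 B tile into an 8x16 f32 accumulator.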
+// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
+gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
+ // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ gpu.return
+}
+
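+// SIMT dpas: operands are per-lane fragments; sg_map_a/b/c record how the A, B, and result tiles are distributed across the 16 lanes.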
+// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
+gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
+ // CHECK: sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
+ // CHECK: sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
+ sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
+ sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ gpu.return
+}
+
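+// VC dpas with pre-packed B: vector<8x16x2xf16> holds the 16x16 B tile with row pairs folded into the innermost (vnni) dimension.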
+// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
+gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
+ // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
+ gpu.return
+}
+
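+// Masked scattered atomic: addf is applied element-wise on the lanes enabled by the i1 mask.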
+// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
+gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
+ //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+ %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
+ gpu.return
+}
+
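+// Named barriers: allocate a pool, initialize a barrier with an id and thread count, then arrive and wait on it.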
+// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
+gpu.func @alloc_nbarrier() {
+ // CHECK: xegpu.alloc_nbarrier
+ xegpu.alloc_nbarrier 8
+ gpu.return
+}
+
+// CHECK: gpu.func @init_nbarrier({{.*}}) {
+gpu.func @init_nbarrier() {
+ //CHECK: %[[c1:.*]] = arith.constant 1 : i8
+ //CHECK: %[[c16:.*]] = arith.constant 16 : i8
+ %nbarrier_id = arith.constant 1 : i8
+ %threads_count = arith.constant 16 : i8
+ //CHECK: xegpu.init_nbarrier %[[c1]], %[[c16]] : i8, i8 -> !xegpu.nbarrier
+ %nbarrier = xegpu.init_nbarrier %nbarrier_id, %threads_count : i8, i8 -> !xegpu.nbarrier
+ gpu.return
+}
+
+// CHECK: gpu.func @nbarrier_arrive(%[[arg0:.*]]: !xegpu.nbarrier) {
+gpu.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
+ //CHECK: xegpu.nbarrier_arrive %[[arg0]] : !xegpu.nbarrier
+ xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
+ gpu.return
+}
+
+// CHECK: gpu.func @nbarrier_wait(%[[arg0:.*]]: !xegpu.nbarrier) {
+gpu.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
+ //CHECK: xegpu.nbarrier_wait %[[arg0]] : !xegpu.nbarrier
+ xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
+ gpu.return
+}
+
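+// Fence ordering global-memory accesses at workgroup scope.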
+// CHECK-LABEL: gpu.func @fence({{.*}}) {
+gpu.func @fence() {
+ //CHECK: xegpu.fence memory_kind = global, fence_scope = workgroup
+ xegpu.fence memory_kind = global, fence_scope = workgroup
+ gpu.return
+}
+
+}