[Mlir-commits] [mlir] [MLIR][XeGPU] Extend SGMapAttr (PR #132425)
Chao Chen
llvmlistbot at llvm.org
Fri Mar 21 11:09:09 PDT 2025
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/132425
>From 4838b524a635e566175aa087440283b909555402 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 16:54:15 +0000
Subject: [PATCH 1/4] extend sg_map from subgroup to workgroup
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 76 ++++--
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 63 +++--
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 16 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 126 ++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 157 ++++++++----
mlir/test/Dialect/XeGPU/invalid.mlir | 110 +++++----
mlir/test/Dialect/XeGPU/ops.mlir | 230 ++++++++++--------
7 files changed, 457 insertions(+), 321 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 0136b18ccfa94..7adb9df3c6b25 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,33 +154,81 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
+def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
+def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
+def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">; // simt level code
+
+def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumerate of scope",
+ [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ScopeAttr
+ : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
+ let summary = [{Describe the stage of lowering progress}];
+ let assemblyFormat = "``$value";
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let summary = [{
Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
}];
let description = [{
- To distribute the XeGPU operation to work items, the tensor_desc must be specified with the sg_map
- attribute at the tensor description creation time.
- Within the `sg_map`, `wi_layout` specifies the layout of work items,
- describing the mapping of work items to the tensor.
- wi_layout[0] x wi_layout[1] must be equal to the total number of work items within a subgroup.
- `wi_data` specifies the minimum number of data elements assigned to each work item for a single distribution.
-
- E.g., #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the subgroup has 16 work items in wi_layout=[1, 16],
- each accessing 1 element as specified by wi_data=[1, 1].
+ XeGPU operations leverages LayoutAttr to distribute data across work-item. It is specified in tensor_descs
+ upon the tensor description creation. LayoutAttr contains the following parameters.
+
+ * scope: specifies the scope of current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
+ it is hard required for subgroup, but optional for workgroup and wi. By default, if a LayoutAttr
+ contains sg_layout and sg_data, it will be treated as workgroup code; and if it only contains
+ wi_layout and wi_data, it will be considered as workitem level.
+ * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
+ * sg_data: [optional] specifies the data size accessed per subgroup.
+ * order: [optional] specifies the dimension order used to linearize n-d sbugroup ids to 1-d.
+ The first dimension in the order list is the fastest-changing dimension.
+ * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
+ * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
`wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+
+ E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+ In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+
+ E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+ In this example, the layout representing a workgroup work distribution. A workgroup has 8 subgroups organized as 2x4 layout.
+ and each subgroup accesses a 16x16 block per instruction, which is further disbributed to 16 work items as described above.
+
}];
let parameters = (ins
- ArrayRefParameter<"uint32_t">:$wi_layout,
- ArrayRefParameter<"uint32_t">:$wi_data
+ OptionalParameter<"ScopeAttr">: $scope,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
+ OptionalParameter<"DenseI32ArrayAttr">: $order,
+ "DenseI32ArrayAttr": $wi_layout,
+ "DenseI32ArrayAttr": $wi_data
);
+ let extraClassDeclaration = [{
+ bool isForWorkgroupLevel() {
+ if (!getScope())
+ return getSgLayout() && getSgData();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+ }
+
+ bool isForSubgroupLevel() {
+ return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+ }
+
+ bool isForWorkItemLevel() {
+ if (!getScope())
+ return !getSgLayout() && !getSgData() && !getOrder();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+ }
+ }];
- let hasCustomAssemblyFormat = 1;
+ let assemblyFormat = "`<` struct(params) `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 56b836d707a7d..6b27ae3b2754c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.
- In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
+ In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
mapping of the tensor descriptor to the work items.
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c0 = arith.constant 0 : index
%c1 = arith.constant 8 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
fp32 or fp64. It implies that vnni and transpose cannot exit at the
same time.
- In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result
vector represents the data to be loaded by each work-item.
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
```
@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
Corresponding cache hint attribute will be masked.
- In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input
vector represents the data to be stored by each work-item.
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
Example 2 (SIMT mode):
```
%2 = xegpu.update_nd_offset %1, [0, 16]:
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
the chunk_size if the chunk size is larger than 1.
In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
- with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+ with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
In this case, the first dimension of the tensor descriptor represents the work-items, and
the second dimension represents the chunk size.
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
```
}];
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
- In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result vector
represents the data to be loaded by each work-item. Each work-item recieves a `chunk_size`
number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
vector<16xi1> -> vector<8x1xf32>
```
@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
- In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input vector
represents the data to be stored by each work-item. Each work-item recieves a `chunk_size`
number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
```
}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
```
}];
@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.
- In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
- which descibes the data fragment owned by each work-item w.r.t. the tensor
- descriptor these data are loaded from.
+ In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
+ which descibe the data fragment owned by each work-item w.r.t. the tensor descriptor
+ these data are loaded from.
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
XeGPU_DpasOpType : $lhs,
XeGPU_DpasOpType : $rhs,
Optional<XeGPU_Vector2DType>: $acc,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
+ OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
let results = (outs XeGPU_Vector2DType: $result);
let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
VectorType getResultType() {
return getResult().getType();
}
+
+ bool hasAcc() {
+ return getAcc() != nullptr;
+ }
}];
let assemblyFormat = [{
@@ -979,4 +983,21 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+ let summary = "Convert the sg layout of the input operand";
+ let description = [{
+ convert_layout remaps the distribution of data across workitems by updating the LayoutAttr.
+ }];
+ let arguments = (ins XeGPU_Vector2DType: $source,
+ XeGPU_LayoutAttr: $srcMap,
+ XeGPU_LayoutAttr: $resMap
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source)
+ }];
+
+ let hasVerifier = 1;
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index ccd91a928e1dd..c92ea42efde3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, sg_map `<` wi_layout = value, wi_data = value `>`)?
+ attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
```
Examples:
@@ -78,15 +78,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
- // A TensorDesc with a sg_map
- xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // A TensorDesc with a layout
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
"mlir::Type": $elementType,
OptionalParameter<"mlir::Attribute">: $encoding,
- OptionalParameter<"mlir::Attribute">: $sg_map);
+ OptionalParameter<"mlir::Attribute">: $layout);
let builders = [
TypeBuilderWithInferredContext<(ins
@@ -95,13 +95,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"int", "1">: $array_length,
CArg<"bool", "true">: $boundary_check,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>,
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
TypeBuilderWithInferredContext<(ins
"llvm::ArrayRef<int64_t>": $shape,
"mlir::Type": $elementType,
CArg<"int", "1">: $chunk_size,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
];
let extraClassDeclaration = [{
@@ -127,8 +127,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
}
- SGMapAttr getSGMapAttr() const {
- return llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+ LayoutAttr getLayoutAttr() const {
+ return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
}
xegpu::MemorySpace getMemorySpace() const {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..52b9f2c192b3f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -68,73 +68,39 @@ LogicalResult ScatterTensorDescAttr::verify(
}
//===----------------------------------------------------------------------===//
-// XeGPU_SGMapAttr
+// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
-namespace {
-template <typename T, unsigned N>
-LogicalResult parseIntArrayField(::mlir::AsmParser &parser,
- llvm::SmallVector<T, N> &result,
- llvm::StringRef fieldName) {
- if (failed(parser.parseKeyword(fieldName))) {
- parser.emitError(parser.getCurrentLocation(),
- "unexpected field name. Expected " + fieldName + ".");
- return failure();
+LogicalResult
+LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+ ScopeAttr scope,
+ DenseI32ArrayAttr sg_layout,
+ DenseI32ArrayAttr sg_data,
+ DenseI32ArrayAttr order,
+ DenseI32ArrayAttr wi_layout,
+ DenseI32ArrayAttr wi_data) {
+
+ if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
+ return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
}
- if (failed(parser.parseEqual())) {
- parser.emitError(parser.getCurrentLocation(), "expected '=' sign.");
- return failure();
+ if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
+ return emitError() << "expected sg_layout and sg_data being both present or both absent";
}
- auto elemParser = [&]() -> llvm::ParseResult {
- uint32_t elem = 0;
- auto res = parser.parseInteger(elem);
- result.push_back(elem);
- return res;
- };
-
- return parser.parseCommaSeparatedList(AsmParser::Delimiter::Square,
- elemParser, fieldName);
-}
-} // namespace
-
-mlir::Attribute SGMapAttr::parse(::mlir::AsmParser &parser,
- ::mlir::Type attrType) {
- if (failed(parser.parseLess()))
- return {};
-
- llvm::SmallVector<uint32_t, 2> wi_layout, wi_data;
- if (failed(parseIntArrayField(parser, wi_layout, "wi_layout")))
- return {};
-
- if (failed(parser.parseComma()))
- return {};
-
- if (failed(parseIntArrayField(parser, wi_data, "wi_data")))
- return {};
+ if (order) {
+ if (!sg_layout)
+ return emitError() << "expected order being used with sg_layout and sg_data.";
+ if (order.size() != sg_layout.size())
+ return emitError() << "expected order having the same rank as sg_layout and sg_data";
+ }
- return SGMapAttr::getChecked(
- [&]() { return parser.emitError(parser.getNameLoc()); },
- parser.getContext(), wi_layout, wi_data);
-}
+ if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+ return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+ }
-void SGMapAttr::print(::mlir::AsmPrinter &printer) const {
- printer << "<";
- printer.printKeywordOrString("wi_layout");
- printer << " = [" << getWiLayout() << "], ";
- printer.printKeywordOrString("wi_data");
- printer << " = [" << getWiData() << "]";
- printer << ">";
-}
+ if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+ return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
-LogicalResult
-SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- llvm::ArrayRef<uint32_t> wi_layout,
- llvm::ArrayRef<uint32_t> wi_data) {
- if (wi_layout.size() != 2)
- return emitError() << "expected wi_layout of size 2";
- if (wi_data.size() != 2)
- return emitError() << "expected wi_data of size 2";
return success();
}
@@ -146,7 +112,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
llvm::SmallVector<int64_t> shape;
mlir::Type elementType;
mlir::FailureOr<mlir::Attribute> encoding;
- mlir::FailureOr<mlir::Attribute> sg_map;
+ mlir::FailureOr<mlir::Attribute> layout;
// Parse literal '<'
if (parser.parseLess())
@@ -169,8 +135,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
mlir::Attribute attr;
ParseResult res = parser.parseAttribute(attr);
if (mlir::succeeded(res)) {
- if (mlir::isa<SGMapAttr>(attr)) {
- sg_map = attr;
+ if (mlir::isa<LayoutAttr>(attr)) {
+ layout = attr;
continue;
}
if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
@@ -188,7 +154,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
return TensorDescType::getChecked(
[&]() { return parser.emitError(parser.getNameLoc()); },
parser.getContext(), shape, elementType,
- encoding.value_or(mlir::Attribute()), sg_map.value_or(mlir::Attribute()));
+ encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute()));
}
void TensorDescType::print(::mlir::AsmPrinter &printer) const {
@@ -208,8 +174,8 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
if (auto encoding = getEncoding())
printer << ", " << encoding;
- if (auto sg_map = getSgMap())
- printer << ", " << sg_map;
+ if (auto layout = getLayout())
+ printer << ", " << layout;
printer << ">";
}
@@ -218,29 +184,29 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int array_length,
bool boundary_check,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
boundary_check);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int chunk_size,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
LogicalResult TensorDescType::verify(
llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
- mlir::Attribute encoding, mlir::Attribute sg_map) {
+ mlir::Attribute encoding, mlir::Attribute layout) {
size_t rank = shape.size();
// Low-pressure types are packed in 32-bit units.
- unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
+ int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
@@ -274,9 +240,9 @@ LogicalResult TensorDescType::verify(
return emitError() << "SLM is not supported for 2D block tensor";
}
- if (auto sgMapAttr = llvm::dyn_cast_if_present<SGMapAttr>(sg_map)) {
- ArrayRef<uint32_t> wiLayout = sgMapAttr.getWiLayout();
- ArrayRef<uint32_t> wiData = sgMapAttr.getWiData();
+ if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+ ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
+ ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
if (rank == 1) {
if (wiLayout[0] != 1 || wiData[0] != 1)
@@ -318,7 +284,7 @@ LogicalResult TensorDescType::verify(
return success();
}
-// If tensor descriptor has a sg_map attribute it is used in SIMT mode.
+// If tensor descriptor has a layout attribute it is used in SIMT mode.
// In this mode, the distributed vector shape is determined as follows:
// Definitions:
// wi_data_size = wi_data[0] × wi_data[1]
@@ -343,13 +309,13 @@ LogicalResult TensorDescType::verify(
// Distributed vector shape must be:
// [n_distribution_units, wi_data_size]
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
- auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
- // If no sg_map is provided, tensor desc is not used in SIMT mode.
- if (!sgMap)
+ auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+ // If no layout is provided, tensor desc is not used in SIMT mode.
+ if (!layout || !layout.isForWorkItemLevel())
return failure();
- SmallVector<int64_t> wiData(sgMap.getWiData());
- SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+ SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
+ SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
auto tdescShape = getShape();
auto wiDataSize = 1, sgSize = 1;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..c7e863256f235 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -78,18 +78,18 @@ static LogicalResult
isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
ArrayRef<int64_t> adjustedTdescShape,
function_ref<InFlightDiagnostic()> emitError) {
- auto sgMap = tdescTy.getSGMapAttr();
+ auto layout = tdescTy.getLayoutAttr();
auto valueShape = valueTy.getShape();
- // sg_map not present means IR is in SIMD mode. In this case value shape must
+ // layout not present means IR is in SIMD mode. In this case value shape must
// match adjusted tensor descriptor shape.
- if (!sgMap)
+ if (!layout || !layout.isForWorkItemLevel())
return valueShape == adjustedTdescShape
? success()
: emitError()
<< "Value shape " << makeString(valueShape)
<< " is not consistent with tensor descriptor " << tdescTy;
- // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+ // layout present means IR is in SIMT mode. In this case layout determines the
// value shape.
auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
assert(succeeded(expectedValueShapeOrFailure) &&
@@ -105,6 +105,25 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
<< " for tensor descriptor " << tdescTy;
}
+static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
+ xegpu::LayoutAttr attr) {
+ assert(attr && "workgroup map attribute is missing.");
+ llvm::ArrayRef<int32_t> layout, data;
+ if (attr.getSgLayout()) {
+ data = attr.getSgData().asArrayRef();
+ layout = attr.getSgLayout().asArrayRef();
+ } else {
+ data = attr.getWiData().asArrayRef();
+ layout = attr.getWiLayout().asArrayRef();
+ }
+ for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
+ // check s % (d * l) != 0
+ if (s % d != 0 || (s / d) % l != 0)
+ return false;
+ }
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -541,7 +560,7 @@ LogicalResult StoreScatterOp::verify() {
[&]() { return emitOpError(); });
}
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
@@ -569,61 +588,107 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
LogicalResult DpasOp::verify() {
int64_t lhsRank = getLhsType().getRank();
int64_t rhsRank = getRhsType().getRank();
- int64_t resultRank = getResultType().getRank();
+ int64_t resRank = getResultType().getRank();
auto lhsShape = getLhsType().getShape();
auto rhsShape = getRhsType().getShape();
- auto resultShape = getResultType().getShape();
-
- auto sgMapA = getSgMapAAttr();
- auto sgMapB = getSgMapBAttr();
- auto sgMapC = getSgMapCAttr();
+ auto resShape = getResultType().getShape();
+
+ auto layoutA = getALayoutAttr();
+ auto layoutB = getBLayoutAttr();
+ auto layoutC = getCLayoutAttr();
+
+ // make sure the layout attribute is either set for every available
+ // operand or simply not set at all. C is special, since ACC is optional.
+ // If they are all set, they also should be in the same scope.
+ auto isValidSet = [&]() {
+ bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+ if (hasAcc()) {
+ result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+ }
+ result = !result;
- // If sg_maps are not present, then the operation is in SIMD mode.
- if (!sgMapA && !sgMapB && !sgMapC) {
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ if (layoutA) {
+ auto scope = layoutA.getScope();
+ result &= layoutB ? scope == layoutB.getScope() : false;
+ if (hasAcc())
+ result &= layoutC ? scope == layoutC.getScope() : false;
+ }
+ return result;
+ };
+
+ if (!isValidSet())
+ return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+
+ // query the scope from layoutA (a valid setting).
+ if (layoutA && layoutA.isForWorkItemLevel()) {
+ // In SIMT mode, All data fragments must be 2D
+ if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+ return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ auto wiLayoutA = layoutA.getWiLayout();
+ auto wiLayoutB = layoutB.getWiLayout();
+ auto wiLayoutC = layoutC.getWiLayout();
+ // Obtain the expanded shapes of the operands and result using wi_layout.
+ // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeB = {
+ rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
+ resShape[1] * wiLayoutC[1]};
+ auto bK = expandedShapeB[0];
+ if (bK != expandedShapeA[1])
+ return emitOpError("K-dimension mismatch.");
+ if (expandedShapeA[0] != expandedShapeC[0])
+ return emitOpError("M-dimension mismatch.");
+ if (expandedShapeB[1] != expandedShapeC[1])
+ return emitOpError("N-dimension mismatch.");
+ } else { // For other scopes, operands' shape should match the mxkxn semantics.
+ if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
"2D or 3D (packed) vector.");
auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
if (bK != lhsShape[1])
return emitOpError("K-dimension mismatch.");
- if (lhsShape[0] != resultShape[0])
+ if (lhsShape[0] != resShape[0])
return emitOpError("M-dimension mismatch.");
- if (rhsShape[1] != resultShape[1])
+ if (rhsShape[1] != resShape[1])
return emitOpError("N-dimension mismatch.");
- return success();
}
- // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
- // result of DPAS operation.
- if (!sgMapA || !sgMapB || !sgMapC)
- return emitOpError("sg_map attributes for all operands and outputs are "
- "expected in SIMT xegpu::Dpas operation");
-
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto wiLayoutA = sgMapA.getWiLayout();
- auto wiLayoutB = sgMapB.getWiLayout();
- auto wiLayoutC = sgMapC.getWiLayout();
- // Obtain the expanded shapes of the operands and result using wi_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
- lhsShape[1] * wiLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
- resultShape[1] * wiLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
-
return success();
}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_ConvertLayoutOp
+//===----------------------------------------------------------------------===//
+LogicalResult ConvertLayoutOp::verify() {
+ auto srcMap = getSrcMapAttr();
+ auto resMap = getResMapAttr();
+ if (!srcMap)
+ return emitOpError("expected srcMap.");
+ if (!resMap)
+ return emitOpError("expected resMap.");
+
+ if (srcMap.getScope() != resMap.getScope())
+ return emitOpError("expected srcMap and resMap be in the same scope.");
+
+ if (srcMap == resMap)
+ return emitOpError("expected different srcMap and resMap.");
+
+ if (srcMap.isForWorkItemLevel())
+ return emitOpError("doesn't work on SIMT code.");
+
+ auto shape = getSource().getType().getShape();
+ if (!isEvenDistributed(shape, srcMap))
+ return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+
+ if (!isEvenDistributed(shape, resMap))
+ return emitOpError("invalid resMap, data cannot be evenly distributed.");
+
+ return mlir::success();
+}
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 88e9bbf78945b..c4958d920a89f 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -78,25 +78,25 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8x2xf32>
return
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8xf32>
return
}
@@ -134,22 +134,22 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -245,69 +245,69 @@ func.func @test_prefetch_vc_2(%src: ui64) {
}
// -----
-func.func @test_create_tdesc_sg_map_1(%src: ui64) {
+func.func @test_create_tdesc_layout_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_2(%src: ui64) {
+func.func @test_create_tdesc_layout_2(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [2, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_3(%src: ui64) {
+func.func @test_create_tdesc_layout_3(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
return
}
// -----
-func.func @test_load_gather_sg_map_1(%src: ui64) {
+func.func @test_load_gather_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
// -----
-func.func @test_load_gather_sg_map_2(%src: ui64) {
+func.func @test_load_gather_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
// -----
-func.func @test_store_scatter_sg_map_1(%src: ui64) {
+func.func @test_store_scatter_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
// -----
-func.func @test_store_scatter_sg_map_2(%src: ui64) {
+func.func @test_store_scatter_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
@@ -394,18 +394,18 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
}
// -----
-func.func @test_dpas_sg_map_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // expected-error at +1 {{sg_map attributes for all operands and outputs are expected in SIMT xegpu::Dpas operation}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+ // expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
return
}
// -----
-func.func @test_dpas_sg_map_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
+func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
// expected-error at +1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
return
}
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [2, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
return
}
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
return
}
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 8 over 16 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
return
}
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 2 work items with 4 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [4, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
return
}
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 2]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
return
}
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
!xegpu.tensor_desc<4x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 1], wi_data = [2, 1]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
return
}
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -520,6 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected chunk blocks for 2D tensor}}
!xegpu.tensor_desc<16x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [8, 1], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
return
}
+
+// -----
+func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+ // expected-error at +1 {{expected different srcMap and resMap}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+// -----
+func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+ // expected-error at +1 {{expected srcMap and resMap be in the same scope}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c32f1905454b6..6a29a73a20612 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
gpu.return
}
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
gpu.return
}
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
gpu.return
}
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
gpu.return
}
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
gpu.return
}
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
%1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
%1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
gpu.func @test_create_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
gpu.return
}
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
gpu.func @test_create_tdesc_simt_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
gpu.return
}
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
gpu.return
}
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
gpu.return
}
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
%2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
%2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
gpu.return
}
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.func @test_prefetch_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
%s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
gpu.return
}
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- // CHECK: sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- // CHECK: sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
gpu.return
}
@@ -704,4 +704,24 @@ gpu.func @fence() {
gpu.return
}
+// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ gpu.return
+}
+
+gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+
}
>From cb2697927bc75b00abd03a39ffb0698ba8b9e0a4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:14:59 +0000
Subject: [PATCH 2/4] format code
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 35 ++++++++++++----------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 7 +++--
2 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 52b9f2c192b3f..5e21bb805a6a5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -72,34 +72,39 @@ LogicalResult ScatterTensorDescAttr::verify(
//===----------------------------------------------------------------------===//
LogicalResult
LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- ScopeAttr scope,
- DenseI32ArrayAttr sg_layout,
- DenseI32ArrayAttr sg_data,
- DenseI32ArrayAttr order,
- DenseI32ArrayAttr wi_layout,
- DenseI32ArrayAttr wi_data) {
-
- if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
- return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
+ ScopeAttr scope, DenseI32ArrayAttr sg_layout,
+ DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
+ DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
+
+ if (scope && scope.getValue() != Scope::WG &&
+ (sg_layout || sg_data || order)) {
+ return emitError() << "expected sg_layout, sg_data, and order being only "
+ "used at workgroup level.";
}
if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
- return emitError() << "expected sg_layout and sg_data being both present or both absent";
+ return emitError() << "expected sg_layout and sg_data being both present "
+ "or both absent";
}
if (order) {
if (!sg_layout)
- return emitError() << "expected order being used with sg_layout and sg_data.";
+ return emitError()
+ << "expected order being used with sg_layout and sg_data.";
if (order.size() != sg_layout.size())
- return emitError() << "expected order having the same rank as sg_layout and sg_data";
+ return emitError()
+ << "expected order having the same rank as sg_layout and sg_data";
}
- if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
- return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+ if (sg_layout &&
+ (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+ return emitError() << "expected sg_layout and sg_data having the same "
+ "rank, which is not larger than 2";
}
if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
- return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
+ return emitError() << "expected wi_layout and wi_data having the same "
+ "rank, which is not larger than 2";
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index c7e863256f235..66b5054278c8c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -617,7 +617,9 @@ LogicalResult DpasOp::verify() {
};
if (!isValidSet())
- return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+ return emitOpError(
+ "layout attributes should be either set for all operands (for SIMT "
+ "code) or not set at all (for SIMD code).");
// query the scope from layoutA (a valid setting).
if (layoutA && layoutA.isForWorkItemLevel()) {
@@ -643,7 +645,8 @@ LogicalResult DpasOp::verify() {
return emitOpError("M-dimension mismatch.");
if (expandedShapeB[1] != expandedShapeC[1])
return emitOpError("N-dimension mismatch.");
- } else { // For other scopes, operands' shape should match the mxkxn semantics.
+ } else { // For other scopes, operands' shape should match the mxkxn
+ // semantics.
if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
>From 273fc408a1c63fe3d4100708cad190f01b6d2523 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:19:07 +0000
Subject: [PATCH 3/4] remove changes to prefetch op
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6b27ae3b2754c..a3ee6e901a775 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
>From 504d2748efb1ad3d29a3187a5e692d58247a3bdd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 18:06:52 +0000
Subject: [PATCH 4/4] refine the doc for TensorDesc
---
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 43 +++++++++++--------
1 file changed, 24 insertions(+), 19 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index c92ea42efde3b..82d6a4ec39e6b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,27 +34,24 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
[ShapedTypeInterface], "::mlir::TensorType"> {
let summary = "TensorDesc describing regions of interested data.";
let description = [{
- TensorDesc is a type designed to describe regions of the interested data as well as some
- features that are unique to Intel hardware. Different with the builtin tensor type in MLIR,
- it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
- to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
- It encodes the following information:
+ TensorDesc is a type designed to describe regions of interest in data, as well as some features
+ unique to Intel hardware. Unlike the built-in tensor type in MLIR, it essentially contains only
+ metadata and does not hold the data itself. It is primarily designed to support 2D block load/store
+ and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
* shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
and each row contains 16 contiguous data element. The rows could be
- either contiguous or not, depends on whether the encoding attribute
- is set or not.
- * element_type: the data type of the data element, e.g., f16, f32.
+ either contiguous or not, depends on the encoding attribute. If the
+ encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
+ is a ScatterTensorDescAttr, rows are not necessary to be contiguous. If
+ encoding is not set, it is considered as a default BlockTensorDescAttr.
- Similar to the builtin tensor, it also provides an optinal attribute to encoding
- the following information via the TensorDescAttr object:
- * memory_space (xegpu::MemorySpace): [optional] where the data is located,
- global memory or shared memory. It is default to Global.
- * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
- that will be loaded by block load at a time. It is default to 1.
- * boundary_check (bool): [optional] indicates whether the operation detects the boundary
- and pads with zero for out-of-boundary access. It is default to do boundary check.
+ * element_type: the data type of the data element, e.g., f16, f32.
+ Similar to the built-in tensor, it also provides optional attributes for encoding
+ additional information via either BlockTensorDescAttr or ScatterTensorDescAttr, or
+ supporting Workgroup, Subgroup, and workitem (or SIMT) level programmings via the
+ Layout attribute. Please check their definition for details.
Syntax:
@@ -63,7 +60,9 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
+ attr-list = (, encoding-attr)? (, layout-attr)?
+ enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
+ layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? wi_layout = value, wi_data = value `>`)?
```
Examples:
@@ -78,8 +77,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
- // A TensorDesc with a layout
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // A TensorDesc with a layout for workgroup level programming
+ xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
+
+ // A TensorDesc with a layout for subgroup level programming
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>>
+
+ // A TensorDesc with a layout for workitem level programming
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
More information about the Mlir-commits
mailing list