[Mlir-commits] [mlir] bbc6a54 - [MLIR][XeGPU] Remove create tdesc & update offset op from xegpu dialect (#182804)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Apr 15 14:53:02 PDT 2026
Author: Nishant Patel
Date: 2026-04-15T14:52:57-07:00
New Revision: bbc6a54f54467de1a459298d8a8390307223f3df
URL: https://github.com/llvm/llvm-project/commit/bbc6a54f54467de1a459298d8a8390307223f3df
DIFF: https://github.com/llvm/llvm-project/commit/bbc6a54f54467de1a459298d8a8390307223f3df.diff
LOG: [MLIR][XeGPU] Remove create tdesc & update offset op from xegpu dialect (#182804)
This PR removes create_tdesc and update_offset ops from the XeGPU
dialect, as scatter load/store/prefetch now accept memref+offsets
directly.
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/invalid.mlir
mlir/test/Dialect/XeGPU/ops.mlir
mlir/test/Dialect/XeGPU/propagate-layout.mlir
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index a7bea9881602f..313a4355701a8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -72,49 +72,6 @@ def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_td
}
-def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> {
- let summary = [{a composite attribute for `TensorDescType`}];
- let description = [{
- `ScatterTensorDesc` is a composite attribute defined for `TensorDescType`
- for describing following properties of a `TensorDesc`:
-
- 1. `memory_space`: It describes where the data block described by the
- TensorDesc is located, `Global` device memory or `Shared` local memory.
- It is default to `Global`.
-
- 2. `chunk_size`: Specifies the number of contiguous elements accessed per offset.
- The default value is 1.
- }];
-
- let parameters = (ins
- DefaultValuedParameter<
- "MemorySpaceAttr",
- "MemorySpaceAttr::get($_ctxt, xegpu::MemorySpace::Global)",
- "Data memory location"
- >: $memory_space,
- DefaultValuedParameter<
- "IntegerAttr",
- "IntegerAttr::get(IntegerType::get($_ctxt, 64), 1)",
- "Number of contiguous elements"
- >: $chunk_size
- );
-
- let builders = [
- AttrBuilder<(ins
- CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"int", "1">: $chunk_size
- )>
- ];
-
- let extraClassDeclaration = [{
- int64_t getChunkSizeAsInt() {
- return getChunkSize().getInt();
- }
- }];
-
- let genVerifyDecl = 1;
- }
-
//===----------------------------------------------------------------------===//
// XeGPU Memory Scope Enums.
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index f9c3c155a32d5..31fe93d209a6d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -648,107 +648,6 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
let hasVerifier = 1;
}
-def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
- let summary = "create scattered tensor descriptors (TensorDesc).";
- let description = [{
- "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
- a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
- is for creating continuous subviews, "create_tdesc" is for creating non-continuous
- (scattered) subviews, allowing each lane in a subgroup specifying their own offset.
- It accepts the following parameters:
-
- Arguments:
-
- - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
- memory object.
-
- - `offsets`: a vector containing offsets of each access point. Its size
- is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
- implying each element in the vector corresponds to a SIMT lane in the subgroup.
-
- Results:
- - `res`: scattered tensor descriptor
-
- The first dimension of the result TensorDesc corresponds to lanes, so it should
- match the dimension of offsets. It may also has a second dimension corresponding to
- the chunk_size if the chunk size is larger than 1.
-
- Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
- ```mlir
- %a = memref.alloc() : memref<1024xf32>
- %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
- ```
-
- Example 2: It assumes subgroup size is 4, and each workitem access 8 elements.
- It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
-
- Example 3: It is similar to Example 2, but there is some overlaps among workitems.
- It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
- }];
-
- let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source,
- XeGPU_OffsetType:$offsets);
- let results = (outs XeGPU_TensorDesc:$TensorDesc);
-
- let builders = [
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<int64_t>": $offsets)>,
- ];
-
- let assemblyFormat = [{
- $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
- }];
-
- let extraClassDeclaration = [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- mlir::VectorType getOffsetsType() {
- return getOffsets().getType();
- }
-
- size_t getNumOffsets() {
- return getOffsetsType().getNumElements();
- }
-
- mlir::Value getViewSource() { return getSource(); }
-
- unsigned getSourceMemorySpace() {
- auto srcTy = getSource().getType();
- if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
- auto attr = memrefTy.getMemorySpace();
- if (attr) {
- if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
- return static_cast<unsigned>(intAttr.getInt());
- if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
- return static_cast<unsigned>(memSpaceAttr.getValue());
- }
- }
- // take global as default memory scope.
- return static_cast<unsigned>(MemorySpace::Global);
- }
-
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
@@ -764,11 +663,9 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from source.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -791,10 +688,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
```
Example 2 (lane level):
- A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
- It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
+ A variant accepts memref or integer (raw pointer) as base and offsets directly.
The source operand could be a raw pointer (ui64, ui32, i64, i32).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
@@ -896,11 +791,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from source.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -918,32 +811,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
Results:
- `res`: represents loaded data
-
- Example 1 (Workgroup level):
- ```mlir
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>},
- layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>>
- : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<256xi1> -> vector<256xf32>
- ```
-
- Example 2 (Subgroup level):
- ```mlir
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>},
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>>
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
- vector<16xi1> -> vector<16x8xf32>
- ```
-
- Example 3 (Subgroup level):
- A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
- It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
- The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc
- for the restriction of memref.
+ Example 1 (Subgroup level):
+ The source operand is either a memref used as the base pointer
+ or a raw pointer (ui64, ui32, i64, i32).
```mlir
%a = memref.alloc() : memref<1024xf32>
%offsets = vector.step : vector<16xindex>
@@ -955,7 +825,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
- Example 4 (lane level):
+ Example 2 (lane level):
lane level only accepts the offsets variant. chunk_size can be inferred from result
type. In this example, chunk_size is 8.
```mlir
@@ -1067,11 +937,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
- `value`: represents the data to be stored.
- `dest`: represents the memory region to be stored to, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from dest.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -1087,29 +955,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
to be stored. Only valid at workgroup and subgroup levels.
- Example 1 (Workgroup level):
- ```mlir
- xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>,
- layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}>
- : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1>
- ```
-
- Example 2 (Subgroup level):
- ```mlir
- xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>,
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
- : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
- ```
-
- Example 3 (Subgroup level):
- A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
- It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
+ Example 1 (Subgroup level):
+ This form accepts a memref as the base pointer together with offsets.
The dest operand could be a raw pointer (uint64_t).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%val = arith.constant dense<0.0> : vector<16xf32>
@@ -1122,7 +970,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
: vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex>
```
- Example 4 (Lane level):
+ Example 2 (Lane level):
Lane level IR only accepts the offsets variant. chunk_size can be inferred from value
type. In this example, chunk_size is 8.
```mlir
@@ -1213,59 +1061,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
let hasVerifier = 1;
}
-def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
- [AllTypesMatch<["TensorDesc", "result"]>]> {
- let summary = "It updates the offsets for the given tensor descriptor";
-
- let description = [{It behaves similar to `update_nd_offset` in terms that
- it updates offset of a TensorDesc, and the offsets are relative offset to
- the current position in the number of elements. However, `update_nd_offset`
- is to update the start point of a 2D block, so its offset constains two
- elements representing the shift in each dimension. `update_offset` is to
- update the offset per lane, so its offsets contains values representing
- shifts for each lane.
-
- Example:
- ```mlir
- %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %off :
- !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>>, vector<4xindex>
- ```
-
- }];
-
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- XeGPU_OffsetType: $offsets);
- let results = (outs XeGPU_TensorDesc: $result);
-
- let builders = [
- OpBuilder<(ins "mlir::Value": $TensorDesc,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "mlir::Value": $TensorDesc,
- "llvm::ArrayRef<int64_t>": $offsets)>
- ];
-
- let extraClassDeclaration = [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- mlir::VectorType getOffsetsType() {
- return getOffsets().getType();
- }
-
- size_t getNumOffsets() {
- return getOffsetsType().getNumElements();
- }
- }];
-
- let assemblyFormat = [{
- $TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,` type($offsets)
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> {
let summary = "It performs mma computation";
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b13f5a9f2c9d9..33eab14e9dfd8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -60,18 +60,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
* shape: the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
- and each row contains 16 contiguous data element. The rows could be
- either contiguous or not, depends on the encoding attribute. If the
- encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
- is a ScatterTensorDescAttr, rows are not necessary to be contiguous. If
- encoding is not set, it is considered as a default BlockTensorDescAttr.
+ and each row contains 16 contiguous data elements.
* element_type: the data type of the data element, e.g., f16, f32.
Similar to the built-in tensor, it also provides optional attributes for encoding
- additional information via either BlockTensorDescAttr or ScatterTensorDescAttr, or
- supporting Workgroup, Subgroup, and workitem (or SIMT) level programmings via the
- Layout attribute. Please check their definition for details.
+ additional information via BlockTensorDescAttr, or for supporting Workgroup- and
+ Subgroup-level programming via the Layout attribute. Please check their definitions for details.
Syntax:
@@ -81,7 +76,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
attr-list = (, encoding-attr)? (, layout-attr)?
- enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
+ encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)?
layout-attr = DistributeLayoutAttr
```
@@ -127,12 +122,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"int", "1">: $array_length,
CArg<"bool", "true">: $boundary_check,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
- TypeBuilderWithInferredContext<(ins
- "llvm::ArrayRef<int64_t>": $shape,
- "mlir::Type": $elementType,
- CArg<"int", "1">: $chunk_size,
- CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
];
@@ -150,12 +139,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
}
- template <typename T,
- typename = std::enable_if_t<
- std::is_same_v<T, BlockTensorDescAttr> ||
- std::is_same_v<T, ScatterTensorDescAttr>>>
- T getEncodingOfType() const {
- return llvm::dyn_cast_if_present<T>(getEncoding());
+ BlockTensorDescAttr getBlockAttr() const {
+ return llvm::dyn_cast_if_present<BlockTensorDescAttr>(getEncoding());
}
DistributeLayoutAttr getLayoutAttr() const {
@@ -163,35 +148,16 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
}
xegpu::MemorySpace getMemorySpace() const {
- if (auto attr = getEncodingOfType<BlockTensorDescAttr>())
- return attr.getMemorySpace().getValue();
-
- auto attr = getEncodingOfType<ScatterTensorDescAttr>();
- return attr.getMemorySpace().getValue();
+ return getBlockAttr().getMemorySpace().getValue();
}
// get the ArrayLength for blocked TensorDesc
int getArrayLength() {
- auto attr = getEncodingOfType<BlockTensorDescAttr>();
- assert(attr && "invalid on non BlockTensorDescAttr.");
- return attr.getArrayLength().getInt();
+ return getBlockAttr().getArrayLength().getInt();
}
bool getBoundaryCheck() {
- auto attr = getEncodingOfType<BlockTensorDescAttr>();
- assert(attr && "invalid on non BlockTensorDescAttr.");
- return attr.getBoundaryCheck().getValue();
- }
-
- bool isScattered() {
- return bool(getEncodingOfType<ScatterTensorDescAttr>());
- }
-
- // get the ChunkSize for scattered TensorDesc
- int getChunkSizeAsInt() {
- auto attr = getEncodingOfType<ScatterTensorDescAttr>();
- assert(attr && "invalid on non ScatterTensorDescAttr.");
- return attr.getChunkSizeAsInt();
+ return getBlockAttr().getBoundaryCheck().getValue();
}
/// Helper to drop all layout information from the TensorDesc type.
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 93da74e938c84..50eba56a16080 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -1118,9 +1118,6 @@ struct ConvertXeGPUToXeVMPass
return VectorType::get(sum, elemType);
});
typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type {
- // Scattered descriptors are not supported in XeVM lowering.
- if (type.isScattered())
- return {};
if (type.getRank() == 1)
return xevmIndexType;
return VectorType::get(8, i32Type);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 80a3fc91f1c4f..811b09b011e47 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -140,28 +140,6 @@ bool BlockTensorDescAttr::hasDefaultsOnly() {
getArrayLength().getInt() == 1 && getBoundaryCheck().getValue();
}
-//===----------------------------------------------------------------------===//
-// XeGPU_ScatterTensorDescAttr
-//===----------------------------------------------------------------------===//
-ScatterTensorDescAttr
-ScatterTensorDescAttr::get(mlir::MLIRContext *context,
- xegpu::MemorySpace memory_space, int chunk_size) {
- auto scopeAttr = MemorySpaceAttr::get(context, memory_space);
- auto chunkSizeAttr =
- IntegerAttr::get(IntegerType::get(context, 64), chunk_size);
- return Base::get(context, scopeAttr, chunkSizeAttr);
-}
-
-LogicalResult ScatterTensorDescAttr::verify(
- llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- MemorySpaceAttr memory_space, IntegerAttr chunk_size) {
- int64_t chunkSize = chunk_size.getInt();
- if (chunkSize <= 0)
- return emitError() << "invalid chunk size";
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
@@ -1254,7 +1232,7 @@ mlir::Type TensorDescType::parse(AsmParser &parser) {
layout = attr;
continue;
}
- if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
+ if (mlir::isa<BlockTensorDescAttr>(attr)) {
encoding = attr;
continue;
}
@@ -1309,15 +1287,6 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
return Base::get(context, shape, elementType, attr, layout);
}
-TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
- mlir::Type elementType, int chunk_size,
- MemorySpace memory_space,
- mlir::Attribute layout) {
- auto *context = elementType.getContext();
- auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
- return Base::get(context, shape, elementType, attr, layout);
-}
-
LogicalResult
TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
@@ -1339,30 +1308,6 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
return emitError() << "unsupported element type " << elementType
<< ": expected integer or float";
- // for gather and scatter ops, Low-precision types are packed in 32-bit
- // units.
- unsigned bitWidth = elementType.getIntOrFloatBitWidth();
- int chunkAlignmentFactor =
- bitWidth < xegpu::uArch::generalPackedFormatBitSize
- ? xegpu::uArch::generalPackedFormatBitSize / bitWidth
- : 1;
- auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
- if (scatterAttr) {
- int64_t chunkSize = scatterAttr.getChunkSizeAsInt();
- if (rank == 1 && chunkSize != 1)
- return emitError() << "expected non-contiguous elements for 1D tensor";
-
- // If chunk size > 1, the second dimension of the tensor shape must be
- // equal to chunk size and it must be a multiple of the
- // chunkAlignmentFactor.
- if (chunkSize > 1) {
- if (shape.back() != chunkSize)
- return emitError() << "expected last dim of tensor to match chunk size";
- if (shape.back() % chunkAlignmentFactor != 0)
- return emitError() << "expected last dim of tensor to be a multiple of "
- << chunkAlignmentFactor;
- }
- }
if (auto layoutAttr =
mlir::dyn_cast_if_present<DistributeLayoutAttr>(layout)) {
if (rank != (size_t)layoutAttr.getRank())
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 43c2e0aa37f22..51ce6ce53a2fe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -74,53 +74,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
}
-static LogicalResult
-isValidGatherScatterParams(Type maskTy, VectorType valueTy,
- TensorDescType tdescTy,
- function_ref<InFlightDiagnostic()> emitError) {
-
- if (!tdescTy.isScattered())
- return emitError() << "Expects a scattered TensorDesc.";
-
- auto chunkSize = tdescTy.getChunkSizeAsInt();
- if (!valueTy) {
- if (chunkSize > 1)
- return emitError() << "Expecting chunk size == 1 for scalar result";
- if (dyn_cast<VectorType>(maskTy))
- return emitError() << "Expecting a vector type result.";
- return success();
- }
-
- auto maskShape = getShapeOf(maskTy);
- auto valueShape = getShapeOf(valueTy);
- auto tdescShape = getShapeOf(tdescTy);
-
- if (valueTy.getElementType() != tdescTy.getElementType())
- return emitError()
- << "Value should have the same element type as TensorDesc.";
-
- llvm::SmallVector<int64_t> expectedMaskShape(tdescShape);
- if (chunkSize > 1)
- expectedMaskShape.pop_back();
- if (expectedMaskShape != maskShape)
- return emitError()
- << "Mask should match TensorDesc except the chunk size dim.";
-
- // a valid shape for SIMT case
- if (valueTy.getRank() == 1 && valueTy.getNumElements() == chunkSize) {
- if (tdescTy.getLayoutAttr())
- return emitError() << "TensorDesc doesn't need LayoutAttr for SIMT code";
- return success();
- }
-
- if (tdescShape != valueShape)
- return emitError() << "Value shape " << makeString(valueShape)
- << " is neither a valid distribution for SIMT nor "
- "consistent with the tensor descriptor for SIMD "
- << tdescTy;
- return success();
-}
-
static LogicalResult
isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy,
VectorType valueTy, int64_t chunkSize,
@@ -408,9 +361,6 @@ LogicalResult CreateNdDescOp::verify() {
return emitOpError("TensorDesc should have the same element "
"type with the source if it is a memref.\n");
- if (getType().isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
-
return success();
}
@@ -491,8 +441,6 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
LogicalResult PrefetchNdOp::verify() {
auto tdescTy = getTensorDescType();
- if (tdescTy.isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
if (!isReadHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -556,9 +504,6 @@ LogicalResult LoadNdOp::verify() {
auto tdescTy = getTensorDescType();
auto valueTy = getType();
- if (tdescTy.isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
-
if (tdescTy.getRank() > 2)
return emitOpError("Expects a 1D or 2D TensorDesc.\n");
@@ -682,9 +627,6 @@ LogicalResult StoreNdOp::verify() {
auto dstTy = getTensorDescType(); // Tile
auto valTy = getValueType(); // Vector
- if (dstTy.isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
-
if (dstTy.getRank() > 2)
return emitOpError("Expects a 1D or 2D TensorDesc.\n");
@@ -752,8 +694,6 @@ LogicalResult StoreNdOp::verify() {
//===----------------------------------------------------------------------===//
LogicalResult UpdateNdOffsetOp::verify() {
auto ty = getTensorDescType();
- if (ty.isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
// number of offsets specified must match the rank of the tensor descriptor
if (ty.getRank() != (int64_t)getNumOffsets()) {
@@ -762,59 +702,6 @@ LogicalResult UpdateNdOffsetOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// XeGPU_CreateDescOp
-//===----------------------------------------------------------------------===//
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<OpFoldResult> offsets) {
- auto loc = source.getLoc();
- int64_t size = static_cast<int64_t>(offsets.size());
- auto type = VectorType::get(size, builder.getIndexType());
- auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
- auto offset = vector::FromElementsOp::create(builder, loc, type, values);
- build(builder, state, TensorDesc, source, offset);
-}
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<int64_t> offsets) {
- auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
- build(builder, state, TensorDesc, source, ofrs);
-}
-
-LogicalResult CreateDescOp::verify() {
- auto tdescTy = getTensorDescType();
-
- if (!tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.\n");
-
- // Memory space of created TensorDesc should match with the source.
- // Both source and TensorDesc are considered for global memory by default,
- // if the memory scope attr is not specified. If source is an integer,
- // it is considered as ptr to global memory.
- auto srcMemorySpace = getSourceMemorySpace();
- auto tdescMemorySpace = static_cast<unsigned>(tdescTy.getMemorySpace());
- if (srcMemorySpace != tdescMemorySpace)
- return emitOpError("Memory space mismatch.")
- << " Source: " << srcMemorySpace
- << ", TensorDesc: " << tdescMemorySpace;
-
- // check total size
- auto chunkSize = tdescTy.getChunkSizeAsInt();
- SmallVector<int64_t> shape(getOffsetsType().getShape());
- if (chunkSize != 1)
- shape.push_back(chunkSize);
-
- auto tdescShape = getShapeOf(tdescTy);
- if (shape != tdescShape)
- return emitOpError("Incorrect TensorDesc shape. ")
- << "Expected is " << makeString(shape) << "\n";
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_PrefetchOp
//===----------------------------------------------------------------------===//
@@ -827,9 +714,6 @@ LogicalResult PrefetchOp::verify() {
if (tdescTy && getOffsets())
return emitOpError("offsets not allowed.");
- if (tdescTy && !tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.");
-
if (!isReadHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -881,9 +765,6 @@ LogicalResult LoadGatherOp::verify() {
if (tdescTy && getOffsets())
return emitOpError("offsets not allowed.");
- if (tdescTy && !tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.");
-
if (!isReadHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -893,9 +774,6 @@ LogicalResult LoadGatherOp::verify() {
if (!isReadHintOrNone(getL3HintAttr()))
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
- if (tdescTy)
- return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
- [&]() { return emitOpError(); });
auto srcTy = getSourceType();
uint64_t chunkSize = static_cast<int64_t>(getChunkSize().value_or(1));
auto memTy = dyn_cast<MemRefType>(srcTy);
@@ -969,9 +847,6 @@ LogicalResult StoreScatterOp::verify() {
if (tdescTy && getOffsets())
return emitOpError("offsets not allowed.");
- if (tdescTy && !tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.");
-
if (!isWriteHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -981,10 +856,6 @@ LogicalResult StoreScatterOp::verify() {
if (!isWriteHintOrNone(getL3HintAttr()))
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
- if (tdescTy)
- return isValidGatherScatterParams(maskTy, valueTy, tdescTy,
- [&]() { return emitOpError(); });
-
auto destTy = getDestType();
uint64_t chunkSize = static_cast<int64_t>(getChunkSize().value_or(1));
auto memTy = dyn_cast<MemRefType>(destTy);
@@ -1045,45 +916,6 @@ void StoreScatterOp::build(
l3_hint, layout);
}
-//===----------------------------------------------------------------------===//
-// XeGPU_UpdateOffsetOp
-//===----------------------------------------------------------------------===//
-void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
- mlir::Value tensorDesc,
- llvm::ArrayRef<OpFoldResult> offsets) {
- auto tdescTy = mlir::dyn_cast<TensorDescType>(tensorDesc.getType());
- assert(tdescTy && "Expecting the source is a TensorDescType value.");
- auto loc = tensorDesc.getLoc();
- int64_t size = static_cast<int64_t>(offsets.size());
- auto type = VectorType::get({size}, builder.getIndexType());
- auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
- auto offset = vector::FromElementsOp::create(builder, loc, type, values);
- build(builder, state, tdescTy, tensorDesc, offset);
-}
-
-void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
- Value tensorDesc, llvm::ArrayRef<int64_t> offsets) {
- auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
- build(builder, state, tensorDesc, ofrs);
-}
-
-LogicalResult UpdateOffsetOp::verify() {
- auto tdescTy = getTensorDescType();
- if (!tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.\n");
-
- SmallVector<int64_t> expectedOffsetShape = getShapeOf(tdescTy);
- SmallVector<int64_t> offsetShape = getShapeOf(getOffsetsType());
- if (tdescTy.getChunkSizeAsInt() > 1)
- expectedOffsetShape.pop_back();
-
- if (expectedOffsetShape != offsetShape)
- return emitOpError(
- "Offsets should match TensorDesc except the chunk size dim.");
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index ef6a494b76638..7fc5d2fffae51 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -136,8 +136,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::LoadMatrixOp>(
+ op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
xegpu::StoreMatrixOp>(op))
@@ -145,13 +145,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::StoreNdOp>(op))
return getTileShape(op->getOpOperand(1));
- // Handle LoadGatherOp and StoreScatterOp (with and without offset)
- if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
- if (loadGatherOp.getOffsets())
- return getTileShape(loadGatherOp->getOpResult(0));
- else
- return getTileShape(loadGatherOp->getOpOperand(0));
- }
+ if (isa<xegpu::LoadGatherOp>(op))
+ return getTileShape(op->getOpResult(0));
if (auto convertLayoutOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
auto inputInstData =
@@ -165,10 +160,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
return targetInstData;
}
- if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- return getTileShape(storeScatterOp.getOffsets()
- ? storeScatterOp->getOpOperand(0)
- : storeScatterOp->getOpOperand(1));
+ if (isa<xegpu::StoreScatterOp>(op))
+ return getTileShape(op->getOpOperand(0));
if (isa<xegpu::DpasOp>(op)) {
std::optional<SmallVector<int64_t>> aTile =
@@ -340,23 +333,6 @@ void XeGPUBlockingPass::runOnOperation() {
if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
Attribute encoding = tdescTy.getEncoding();
- // If the encoding is a ScatterTensorDescAttr, we need to
- // potentially adjust the chunk size based on the inst_data.
- if (tdescTy.isScattered()) {
- int64_t chunkSize = tdescTy.getChunkSizeAsInt();
-
- if (chunkSize > 1) {
- int64_t blockedChunkSize = chunkSize;
- auto instData = tdescTy.getLayoutAttr().getEffectiveInstDataAsInt();
- if (!instData.empty())
- blockedChunkSize = instData.back();
-
- // To create a new attribute with a different chunk_size:
- auto newEncoding = xegpu::ScatterTensorDescAttr::get(
- ctx, tdescTy.getMemorySpace(), blockedChunkSize);
- encoding = newEncoding;
- }
- }
newTy =
xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index ff9ff4937c293..686cb20e1976e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -278,15 +278,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1}));
}
-static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
- unsigned rank, int subgroupSize) {
- assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
- if (rank == 1) {
- return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
- }
- return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
-}
-
/// Helper to get the default layout for 2D block operations.
template <typename Ty>
static LayoutInfo getSIMTLayoutInfoBlockIO(Ty ty,
@@ -349,10 +340,6 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- void visitCreateDescOp(xegpu::CreateDescOp createDesc,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results);
-
void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -451,9 +438,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](xegpu::LoadGatherOp loadGatherOp) {
visitLoadGatherOp(loadGatherOp, operands, results);
})
- .Case([&](xegpu::CreateDescOp createDescOp) {
- visitCreateDescOp(createDescOp, operands, results);
- })
.Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) {
visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
})
@@ -1075,28 +1059,9 @@ void LayoutInfoPropagation::visitLoadGatherOp(
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
propagateIfChanged(operands[0], operands[0]->meet(loadLayoutInfo));
- // Propagate the new layout to the mask and optional offset operand.
+ // Propagate the new layout to the offset and mask operands.
propagateIfChanged(operands[1], operands[1]->meet(maskLayoutInfo));
- if (load.getOffsets())
- propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
-}
-
-/// Propagate the layout of the descriptor to the vector offset operand in
-/// CreateDescOp.
-void LayoutInfoPropagation::visitCreateDescOp(
- xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo descLayout = results[0]->getValue();
- // Need the layout of the descriptor to propagate to the operands.
- if (!descLayout.isAssigned())
- return;
- const uArch *uArch = getUArch(getChipStr(createDesc).value_or(""));
- if (!uArch)
- return;
- // For offset operand propagate 1D default layout.
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
- uArch->getSubgroupSize());
- propagateIfChanged(operands[1], operands[1]->meet(layout));
+ propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
}
/// Set the layout for the value, tensor descriptor, offset and mask operands in
@@ -1136,10 +1101,9 @@ void LayoutInfoPropagation::visitStoreScatterOp(
// Propagate the destination (if tdesc) operand layout
if (isa<xegpu::TensorDescType>(storeScatter.getDestType()))
propagateIfChanged(operands[1], operands[1]->meet(srcLayoutInfo));
- // Propagate the new layout to the mask and optional offset operand.
+ // Propagate the new layout to the offset and mask operands.
propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
- if (storeScatter.getOffsets())
- propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo));
+ propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo));
}
void LayoutInfoPropagation::visitLoadMatrixOp(
@@ -1420,12 +1384,6 @@ ResolveLayoutConflicts::resolveTensorDescConsumer(OpOperand &operand) {
auto currTDescType = dyn_cast<xegpu::TensorDescType>(tdescValue.getType());
assert(anchorOp && currTDescType &&
"Expected anchor layout op and tensor descriptor consumer.");
- // TODO: Scattered tensor desc is not supported for now.
- if (currTDescType.isScattered()) {
- DBGS() << "Scattered tensor descriptor not supported: " << tdescValue
- << "\n";
- return failure();
- }
Attribute currLayout = currTDescType.getLayout();
Attribute expectedLayout = anchorOp.getAnchorLayout();
// A conflict exists in tensor descriptor operand if tensor descriptor's
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ca454e632a3ea..9459164e4d48b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -804,8 +804,8 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
if (!storeScatterOp)
return failure();
- auto offsets = storeScatterOp.getOffsets();
- if (!offsets || !isa<VectorType>(offsets.getType()))
+ Value offsets = storeScatterOp.getOffsets();
+ if (!isa<VectorType>(offsets.getType()))
return rewriter.notifyMatchFailure(
storeScatterOp, "Store op must have a vector of offsets argument");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
@@ -1109,12 +1109,12 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
auto loadGatherOp =
producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
- auto offsets = loadGatherOp.getOffsets();
- if (!offsets || !isa<VectorType>(offsets.getType()) ||
+ Value offsets = loadGatherOp.getOffsets();
+ if (!isa<VectorType>(offsets.getType()) ||
!isa<VectorType>(loadGatherOp.getMask().getType()))
return rewriter.notifyMatchFailure(
loadGatherOp,
- "Load op must have a vector arguments for offsets and mask");
+ "Load op must have vector arguments for offsets and mask");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
VectorType resultVecTy =
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 36b903c5b4303..51693da389a49 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -477,74 +477,6 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
}
};
-struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
- using UnrollPattern<xegpu::CreateDescOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::CreateDescOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getType();
- TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
- VectorType indiceVecTy = indiceVec.getType();
-
- if (!tdescTy.isScattered())
- return failure();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<int64_t> targetIndiceShape(*targetShape);
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
- // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1.
- if (originalChunkSize > 1)
- targetIndiceShape.pop_back();
-
- auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
- SmallVector<Type> convertedIndiceTypes =
- getUnrolledTypes(indiceVecTy, targetIndiceShape);
- SmallVector<Value> convertedIndiceVec =
- pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter);
-
- SmallVector<Value> newOps;
-
- // More indices is need when chunkSize > 1. Since a big load from one
- // address could be break into multiple small loads.
- if (originalChunkSize > 1) {
- int64_t blockedChunkSize = targetShape->back();
- int64_t numNewChunks = originalChunkSize / blockedChunkSize;
-
- for (auto [indice, indiceType] :
- llvm::zip(convertedIndiceVec, convertedIndiceTypes)) {
- for (int64_t i = 0; i < numNewChunks; ++i) {
- // Compute the offset
- Value inc = arith::ConstantIndexOp::create(rewriter, loc,
- i * blockedChunkSize);
- Value incVec =
- vector::BroadcastOp::create(rewriter, loc, indiceType, inc);
- Value offsetIndice =
- arith::AddIOp::create(rewriter, loc, indice, incVec);
-
- auto newOp = xegpu::CreateDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), offsetIndice);
-
- newOps.push_back(newOp);
- }
- }
- } else {
- for (auto indice : convertedIndiceVec) {
- auto newOp = xegpu::CreateDescOp::create(rewriter, loc, newTdescTy,
- op.getSource(), indice);
- newOps.push_back(newOp);
- }
- }
-
- Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
- rewriter.replaceOp(op, castOp);
-
- return success();
- }
-};
-
struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
using UnrollPattern<xegpu::LoadGatherOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadGatherOp op,
@@ -563,7 +495,7 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
return failure();
SmallVector<int64_t> targetMaskShape(*targetShape);
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
+ int originalChunkSize = op.getChunkSize().value_or(1);
VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
@@ -854,7 +786,7 @@ struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
return failure();
SmallVector<int64_t> targetMaskShape(*targetShape);
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
+ int originalChunkSize = op.getChunkSize().value_or(1);
VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
@@ -900,59 +832,6 @@ struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
}
};
-struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
- using UnrollPattern<xegpu::UpdateOffsetOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getTensorDescType();
-
- if (!tdescTy.isScattered())
- return failure();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<Type> convertedTdescTypes =
- getUnrolledTypes(tdescTy, *targetShape);
- SmallVector<Value> convertedTdesc = pack(
- op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
-
- TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
- VectorType offsetVecTy = offsetVec.getType();
- SmallVector<Type> convertedOffsetTypes;
- SmallVector<Value> convertedOffsetVec;
- SmallVector<Value> newOps;
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
- if (originalChunkSize > 1) {
- auto targetOffsetShape = ArrayRef<int64_t>(*targetShape).drop_back();
- convertedOffsetTypes = getUnrolledTypes(offsetVecTy, targetOffsetShape);
-
- int64_t blockedChunkSize = targetShape->back();
- int64_t numNewChunks = originalChunkSize / blockedChunkSize;
- // the offset is reused across the chunk_size dimension
- for (auto offset : pack(offsetVec, convertedOffsetTypes,
- targetOffsetShape, loc, rewriter))
- convertedOffsetVec.append(numNewChunks, offset);
-
- } else {
- convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape);
- convertedOffsetVec =
- pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
- }
-
- for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
- auto newOp =
- xegpu::UpdateOffsetOp::create(rewriter, loc, t.getType(), t, o);
- newOps.push_back(newOp);
- }
- Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
- rewriter.replaceOp(op, castOp);
- return success();
- }
-};
-
struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
@@ -1094,11 +973,11 @@ struct UnrollConvertLayoutOp : public UnrollPattern<xegpu::ConvertLayoutOp> {
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
- patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
- UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
- UnrollPrefetchOp, UnrollUpdateOffsetOp, UnrollLoadMatrixOp,
- UnrollStoreMatrixOp, UnrollLoadGatherOpWithOffset,
- UnrollStoreScatterOpWithOffsets, UnrollConvertLayoutOp>(
- patterns.getContext(), options);
+ patterns
+ .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollLoadGatherOp,
+ UnrollStoreScatterOp, UnrollPrefetchOp, UnrollLoadMatrixOp,
+ UnrollStoreMatrixOp, UnrollLoadGatherOpWithOffset,
+ UnrollStoreScatterOpWithOffsets, UnrollConvertLayoutOp>(
+ patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index d637b6828deab..1e867b9c34069 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -951,9 +951,6 @@ struct WgToSgLoadGatherOpWithOffset
matchAndRewrite(xegpu::LoadGatherOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- if (!op.getOffsets())
- return failure();
-
Location loc = op.getLoc();
VectorType resultType = dyn_cast<VectorType>(op.getResult().getType());
if (!resultType)
@@ -1005,9 +1002,6 @@ struct WgToSgStoreScatterOpWithOffset
matchAndRewrite(xegpu::StoreScatterOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- if (!op.getOffsets())
- return failure();
-
Location loc = op.getLoc();
VectorType valueType = dyn_cast<VectorType>(op.getValue().getType());
if (!valueType)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e83f96bb294a9..9098eb7e4815b 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -55,18 +55,6 @@ mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
// e.g. for 1D layout, sgSize = laneLayout[0]
int64_t sgSize = llvm::product_of(laneLayout);
- // Case 1: regular loads/stores
- auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
- if (scatterAttr) {
- auto chunkSize = scatterAttr.getChunkSize().getInt();
- // Verify if the first dimension of the tensor descriptor shape is
- // distributable.
- assert(tdescShape[0] == laneLayout[0] &&
- "tensor descriptor shape is not distributable");
- return VectorType::get({chunkSize}, elementType);
- }
-
- // Case 2: block loads/stores
// Check if the tensor descriptor shape is distributable.
int64_t tensorSize = 1;
for (auto [tdescDim, laneDim, laneDataDim] :
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 82c7879c79d56..42b38c09e0765 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -79,17 +79,6 @@ func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) {
return
}
-// -----
-func.func @prefetch_nd_vc_2(%src: memref<24xf16>) {
- %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
- // expected-error @+1 {{Expects a non-scattered TensorDesc}}
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
- : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
- return
-}
-
// -----
func.func @load_nd_vc_1(%src: memref<8x16xf16>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -99,17 +88,6 @@ func.func @load_nd_vc_1(%src: memref<8x16xf16>) {
return
}
-// -----
-func.func @load_nd_vc_2(%src: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Expects a non-scattered TensorDesc.}}
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
- : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>> -> vector<8x2xf16>
- return
-}
-
// -----
func.func @load_nd_vc_3(%src: memref<8x16xf16>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
@@ -189,18 +167,6 @@ func.func @store_nd_vc_1(%dst: memref<24x32xf16>) {
return
}
-// -----
-func.func @store_nd_vc_2(%dst: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = arith.constant dense<1.0>: vector<8x2xf16>
- %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Expects a non-scattered TensorDesc}}
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<streaming>}>
- : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
// -----
func.func @store_nd_vc_3(%dst: memref<24x32xf16>) {
%1 = arith.constant dense<1.0>: vector<2x24x32xf16>
@@ -245,140 +211,73 @@ func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
}
// -----
-func.func @update_nd_offset_1(%dst: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Expects a non-scattered TensorDesc}}
- xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_1(%src: ui64) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- // expected-error @+1 {{Expects a scattered TensorDesc}}
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_2(%src: memref<?xf32>) {
+func.func @prefetch_vc_2(%src: memref<?xf32>) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- // expected-error @+1 {{invalid chunk size}}
- -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 0>>
+ // expected-error @+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
+ xegpu.prefetch %src[%0] <{l1_hint = #xegpu.cache_hint<write_back>}> : memref<?xf32>, vector<4xindex>
return
}
// -----
-func.func @create_tdesc_vc_3(%src: memref<?xf32>) {
+func.func @load_gather_vc_2(%src: memref<?xf32>) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error @+1 {{Memory space mismatch}}
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ // expected-error @+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
+ %2 = xegpu.load %src[%0], %1 <{l1_hint = #xegpu.cache_hint<write_back>}>
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @create_tdesc_vc_4(%src: memref<?xf32>) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- // expected-error @+1 {{expected last dim of tensor to match chunk size}}
- -> !xegpu.tensor_desc<4x5xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4>>
+func.func @load_gather_vc_3(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<8xi1>
+ // expected-error @+1 {{Mask should match value except the chunk size dim}}
+ %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}>
+ : memref<?xf32>, vector<4xindex>, vector<8xi1> -> vector<4x2xf32>
return
}
// -----
-func.func @create_tdesc_vc_5(%src: memref<?xf16>) {
+func.func @load_gather_simt_1(%src: memref<?xf32>) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf16>, vector<4xindex>
- // expected-error @+1 {{last dim of tensor to be a multiple of 2}}
- -> !xegpu.tensor_desc<4x3xf16, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
- return
-}
-
-
-// -----
-func.func @prefetch_vc_1(%src: memref<24x32xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- // expected-error @+1 {{Expects a scattered TensorDesc}}
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<24x32xf16>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ // expected-error @+1 {{value elements must match chunk size}}
+ %2 = xegpu.load %src[%0], %1 <{chunk_size = 2}>
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<6xf32>
return
}
// -----
-func.func @prefetch_vc_2(%src: ui64) {
+func.func @store_scatter_vc_2(%dst: memref<?xf32>) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
-// -----
-func.func @create_tdesc_layout_1(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error @+1 {{expected layout rank to match tensor rank}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- return
-}
-
-// -----
-func.func @load_gather_simt_1(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<6xf32>
- return
-}
-
-// -----
-func.func @store_scatter_simt_1(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %val = arith.constant dense<2.9>: vector<6xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- return
-}
-
-// -----
-func.func @load_gather_vc_1(%src: memref<24x32xf16>) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
- // expected-error @+1 {{Expects a scattered TensorDesc}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
- : !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ %2 = arith.constant dense<2.9>: vector<4xf32>
+ // expected-error @+1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
+ xegpu.store %2, %dst[%0], %1 <{l1_hint = #xegpu.cache_hint<streaming>}>
+ : vector<4xf32>, memref<?xf32>, vector<4xindex>, vector<4xi1>
return
}
// -----
-func.func @load_gather_vc_2(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<write_back>}>
- : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- -> vector<4x2xf32>
+func.func @store_scatter_vc_3(%dst: memref<?xf32>) {
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = arith.constant dense<1>: vector<8xi1>
+ %2 = arith.constant dense<2.9>: vector<4x2xf32>
+ // expected-error @+1 {{Mask should match value except the chunk size dim}}
+ xegpu.store %2, %dst[%0], %1 <{chunk_size = 2}>
+ : vector<4x2xf32>, memref<?xf32>, vector<4xindex>, vector<8xi1>
return
}
// -----
-func.func @load_gather_vc_3(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<1>: vector<8xi1>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error @+1 {{Mask should match TensorDesc except the chunk size dim}}
- %2 = xegpu.load %1, %0
- : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<8xi1>
- -> vector<4x2xf32>
+func.func @store_scatter_simt_1(%dst: memref<?xf32>) {
+ %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %1 = arith.constant dense<1>: vector<4xi1>
+ %2 = arith.constant dense<2.9>: vector<6xf32>
+ // expected-error @+1 {{value elements must match chunk size}}
+ xegpu.store %2, %dst[%0], %1 <{chunk_size = 2}>
+ : vector<6xf32>, memref<?xf32>, vector<4xindex>, vector<4xi1>
return
}
@@ -390,16 +289,6 @@ func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) {
return
}
-// -----
-func.func @prefetch_offset_wi_2(%src: memref<16xf32>) {
- %offsets = arith.constant dense<[0]> : vector<1xindex>
- %1 = xegpu.create_tdesc %src, %offsets : memref<16xf32>, vector<1xindex>
- -> !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
- // expected-error @+1 {{offsets not allowed}}
- xegpu.prefetch %1[%offsets]: !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>, vector<1xindex>
- return
-}
-
// -----
func.func @prefetch_offset_wi_3(%src: memref<16xf32>) {
// expected-error @+1 {{Expects offsets}}
@@ -476,22 +365,22 @@ func.func @store_scatter_offset_wi_3(%src: memref<16xf16>) {
}
// -----
-func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>) {
+func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32>) {
%val = arith.constant dense<2.9>: vector<1xf16>
%offsets = arith.constant dense<[0]> : vector<1xindex>
%mask = arith.constant dense<1>: vector<1xi1>
// expected-error@+1 {{offsets not allowed}}
xegpu.store %val, %src[%offsets], %mask
- : vector<1xf16>, !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1>
+ : vector<1xf16>, !xegpu.tensor_desc<1x1xf32>, vector<1xindex>, vector<1xi1>
return
}
// -----
-func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>) {
+func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16>) {
%mask = arith.constant dense<1>: vector<1xi1>
%offsets = arith.constant dense<[0]> : vector<1xindex>
// expected-error@+1 {{offsets not allowed}}
- %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> -> vector<2xf16>
+ %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16>, vector<1xindex>, vector<1xi1> -> vector<2xf16>
return
}
@@ -521,43 +410,6 @@ func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) {
return
}
-// -----
-func.func @store_scatter_vc_1(%src: memref<24x32xf32>) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
- // expected-error@+1 {{Expects a scattered TensorDesc}}
- xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
- : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1>
- return
-}
-
-// -----
-func.func @store_scatter_vc_2(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
- xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<streaming>}> : vector<4x2xf32>,
- !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- return
-}
-
-// -----
-func.func @store_scatter_vc_3(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
- %0 = arith.constant dense<1>: vector<8xi1>
- %1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error@+1 {{Mask should match TensorDesc except the chunk size dim}}
- xegpu.store %1, %2, %0 : vector<4x2xf32>,
- !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<8xi1>
- return
-}
-
// -----
func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
// expected-error@+1 {{K-dimension mismatch}}
@@ -600,15 +452,6 @@ func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
return
}
-// -----
-func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
- %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}}
- xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32>
- return
-}
-
// -----
func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
@@ -657,26 +500,6 @@ func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
return
}
-// -----
-func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- // expected-error@+1 {{expected non-contiguous elements for 1D tensor}}
- !xegpu.tensor_desc<16xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- // expected-error@+1 {{expected last dim of tensor to match chunk size}}
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 4>,
- #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2]>>
- return
-}
-
// -----
func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
// expected-error@+1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}}
@@ -686,112 +509,121 @@ func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected sg_layout and lane_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+func.func @layout_rank_mismatch_sg_lane(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected sg_layout and lane_layout to have the same rank}}
+ {layout = #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected sg_layout and inst_data to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+func.func @layout_rank_mismatch_sg_inst(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected sg_layout and inst_data to have the same rank}}
+ {layout = #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected inst_data and lane_layout to have the same rank}}
- #xegpu.layout<inst_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+func.func @layout_rank_mismatch_inst_lane(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected inst_data and lane_layout to have the same rank}}
+ {layout = #xegpu.layout<inst_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected lane_data and lane_layout to have the same rank}}
- #xegpu.layout<inst_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2, 1]>>
+func.func @layout_rank_mismatch_lane_data(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected lane_data and lane_layout to have the same rank}}
+ {layout = #xegpu.layout<inst_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2, 1]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected sg_data and sg_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+func.func @layout_rank_mismatch_sg_data(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected sg_data and sg_layout to have the same rank}}
+ {layout = #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+func.func @layout_rank_mismatch_tensor(%src: memref<16x32xf32>) {
+ %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<16x32xf32> ->
// expected-error@+1 {{expected layout rank to match tensor rank}}
!xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
#xegpu.layout<sg_layout = [1], sg_data = [32], inst_data = [16]>>
return
}
// -----
-func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{sg_layout and sg_data must be used together}}
- #xegpu.layout<sg_layout = [2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+func.func @layout_sg_data_missing(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{sg_layout and sg_data must be used together}}
+ {layout = #xegpu.layout<sg_layout = [2, 1], lane_layout = [8, 1], lane_data = [1, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{lane_layout and lane_data must be used together}}
- #xegpu.layout<inst_data = [16, 2], lane_layout = [16, 1]>>
+func.func @layout_lane_data_missing(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{lane_layout and lane_data must be used together}}
+ {layout = #xegpu.layout<inst_data = [16, 2], lane_layout = [16, 1]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected sg_layout/lane_layout being used with order}}
- #xegpu.layout<inst_data = [16, 2], order = [0, 1]>>
+func.func @layout_order_without_layout(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected sg_layout/lane_layout being used with order}}
+ {layout = #xegpu.layout<inst_data = [16, 2], order = [0, 1]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected order and sg_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2], order = [0, 1, 2]>>
+func.func @layout_order_rank_mismatch_sg(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected order and sg_layout to have the same rank}}
+ {layout = #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2], order = [0, 1, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
// -----
-func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error@+1 {{expected order and lane_layout to have the same rank}}
- #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
+func.func @layout_order_rank_mismatch_lane(%src: memref<?xf32>) {
+ %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
+ %mask = arith.constant dense<1>: vector<4xi1>
+ %2 = xegpu.load %src[%offsets], %mask
+ // expected-error@below {{expected order and lane_layout to have the same rank}}
+ {layout = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>}
+ : memref<?xf32>, vector<4xindex>, vector<4xi1> -> vector<4xf32>
return
}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 520061925f92c..b32e297b60fc8 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -22,7 +22,6 @@ gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y :
gpu.return
}
-
// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) {
// CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
@@ -30,7 +29,6 @@ gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) {
gpu.return
}
-
// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) {
// CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
@@ -38,7 +36,6 @@ gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) {
gpu.return
}
-
// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
@@ -46,7 +43,6 @@ gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) {
gpu.return
}
-
// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) {
// CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
@@ -61,7 +57,6 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) {
gpu.return
}
-
// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>)
gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
@@ -296,7 +291,6 @@ gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) {
gpu.return
}
-
// CHECK: func @simt_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) {
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
@@ -390,133 +384,6 @@ gpu.func @update_nd_tdesc_2(%src: memref<8x24x32xf32>) {
gpu.return
}
-// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @create_tdesc_1(%src: memref<?xf32, 3>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @create_tdesc_2(%src: memref<?xf32>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @create_tdesc_4(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<4x2xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<4x2xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<4x8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<4x8xf16>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<8xf16>
- gpu.return
-}
-
// CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) {
gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) {
// CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
@@ -545,19 +412,6 @@ gpu.func @simt_load_7(%arg0: memref<256xf16>, %arg1: index, %arg2: i1) {
gpu.return
}
-// CHECK: gpu.func @subgroup_load_4(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<2x4xi1>
- %1 = arith.constant dense<1>: vector<2x4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<2x4xi1> -> vector<2x4x8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<2x4xi1> -> vector<2x4x8xf16>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_load_offset_1(%arg0: memref<?xf16>) {
gpu.func @subgroup_load_offset_1(%src: memref<?xf16>) {
%offset = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -568,96 +422,6 @@ gpu.func @subgroup_load_offset_1(%src: memref<?xf16>) {
gpu.return
}
-// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32>
- %2 = arith.constant dense<2.9>: vector<4x2xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32>
- %2 = arith.constant dense<2.9>: vector<2xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4x2xf16>
- %2 = arith.constant dense<2.9>: vector<4x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16>
- %2 = arith.constant dense<2.9>: vector<2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4xf32>
- %2 = arith.constant dense<2.9>: vector<4xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
- %2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- gpu.return
-}
-
// CHECK: gpu.func @simt_store_4(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: vector<1xindex>, %[[arg3:.*]]: vector<1xi1>) {
gpu.func @simt_store_4(%arg0: vector<8xf16>, %arg1: memref<256xf16>, %arg2: vector<1xindex>, %arg3: vector<1xi1>) {
// CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
@@ -686,21 +450,6 @@ gpu.func @simt_store_7(%arg0: f16, %arg1: memref<256xf16>, %arg2: index, %arg3:
gpu.return
}
-// CHECK: gpu.func @subgroup_store_4(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<2x4xi1>
- %1 = arith.constant dense<1>: vector<2x4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2x4xf32>
- %2 = arith.constant dense<2.9>: vector<2x4xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_store_offset_1(%arg0: memref<?xf16>) {
gpu.func @subgroup_store_offset_1(%dest: memref<?xf16>) {
%val = arith.constant dense<2.9>: vector<4x2xf16>
@@ -712,17 +461,6 @@ gpu.func @subgroup_store_offset_1(%dest: memref<?xf16>) {
gpu.return
}
-// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
-gpu.func @prefetch(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
// CHECK: gpu.func @prefetch_offset(%[[arg0:.*]]: ui64) {
gpu.func @prefetch_offset(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -732,19 +470,6 @@ gpu.func @prefetch_offset(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) {
-gpu.func @create_update_tdesc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
@@ -766,17 +491,6 @@ gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>)
gpu.return
}
-// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
- //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- gpu.return
-}
-
// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
gpu.func @alloc_nbarrier() {
// CHECK: xegpu.alloc_nbarrier
@@ -834,7 +548,6 @@ gpu.func @create_mem_desc_with_stride() {
gpu.return
}
-
// CHECK-LABEL: gpu.func @create_mem_desc_from_2d_memref({{.*}}) {
gpu.func @create_mem_desc_from_2d_memref() {
//CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<16x64xf16, 3>
@@ -940,3 +653,4 @@ gpu.func @dpas_mx(%a : vector<8x16xf8E5M2>, %b: vector<16x16xf8E5M2>, %acc: vect
}
}
+
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 26936dab2fb38..c87dbf3ec2108 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -155,26 +155,6 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
}
}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @load_gather_1d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
-func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.load %0, %cst_0 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
- return
-}
-}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @store_scatter_with_chunksize(
@@ -191,20 +171,7 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
return
}
}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @store_scatter_1d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1>
-func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
- return
-}
-}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops_chunksize(
@@ -224,6 +191,7 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
return
}
}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops(
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 61b8046bd04e5..8b57b262ebddf 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -413,155 +413,6 @@ gpu.module @test_kernel {
}
}
-// -----
-
-gpu.module @test_kernel {
- // CHECK-LABEL: test_prefetch_load_store_update
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}}
- // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
- // CHECK-COUNT-2: xegpu.load {{.*}}
- // CHECK-COUNT-2: xegpu.store {{.*}}
-
- gpu.func @test_prefetch_load_store_update(%src: ui64) {
-
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-
- %delta = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
- 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 64,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 256
- ]> : vector<32xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xf32>
- xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}:
- vector<32xf32>,
- !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>,
- vector<32xi1>
-
- gpu.return
- }
-
-}
-
-// -----
-gpu.module @test_kernel {
- // CHECK-LABEL: test_prefetch_load_store_update_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
-
- gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) {
-
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16, 2]>}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- %delta = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
- 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 64,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 256
- ]> : vector<32xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>} : vector<32xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout<inst_data = [16, 2]>} : vector<32x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>:
- vector<32x4xf32>,
- !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>,
- vector<32xi1>
-
- gpu.return
- }
-}
-
-// -----
-#l = #xegpu.layout<inst_data = [2, 8, 2]>
-
-// test the blocking pass on a 3D scattered tensor descriptor,
-// Ops working 4x8x4xf32 scattered tensor_descs will be unrolled
-// into 4 ops working 2x8x2xf32 scattered tensor_descs based on
-// the given layout.
-gpu.module @test_kernel {
- // CHECK-LABEL: test_3d_scattered_tensor_desc
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: [[cst_1:%.+]] = arith.constant dense<{{.*}}[130, 138, 146, 154, 162, 170, 178, 186], [194, 202, 210, 218, 226, 234, 242, 250]]> : vector<2x8xindex>
- // CHECK: [[cst_2:%.+]] = arith.constant dense<{{.*}}[2, 10, 18, 26, 34, 42, 50, 58], [66, 74, 82, 90, 98, 106, 114, 122]]> : vector<2x8xindex>
- // CHECK: [[cst_3:%.+]] = arith.constant dense<{{.*}}[0, 8, 16, 24, 32, 40, 48, 56], [64, 72, 80, 88, 96, 104, 112, 120]]> : vector<2x8xindex>
- // CHECK: [[cst_4:%.+]] = arith.constant dense<{{.*}}[128, 136, 144, 152, 160, 168, 176, 184], [192, 200, 208, 216, 224, 232, 240, 248]]> : vector<2x8xindex>
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<2x8xindex> -> !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xi1> -> vector<2x8x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} : vector<2x8x2xf32>, !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xi1>
-
-
- gpu.func @test_3d_scattered_tensor_desc(%src: ui64) {
- %cst = arith.constant {layout_result_0 = #l} dense<[
- [0, 8, 16, 24, 32, 40, 48, 56],
- [64, 72, 80, 88, 96, 104, 112, 120],
- [128, 136, 144, 152, 160, 168, 176, 184],
- [192, 200, 208, 216, 224, 232, 240, 248]
- ]> : vector<4x8xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
- xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
-
- %delta = arith.constant {layout_result_0 = #l} dense<[
- [32, 32, 32, 32, 32, 32, 32, 32],
- [32, 32, 32, 32, 32, 32, 32, 64],
- [128, 128, 128, 128, 128, 128, 128, 128],
- [128, 128, 128, 128, 128, 128, 128, 256]
- ]> : vector<4x8xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xindex>
-
- %c4 = arith.constant 4: index
- %mask = vector.create_mask %c4, %c4 {layout_result_0 = #l}: vector<4x8xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>:
- vector<4x8x4xf32>,
- !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>,
- vector<4x8xi1>
- gpu.return
- }
-}
-
// -----
#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
@@ -605,13 +456,13 @@ gpu.module @test_kernel {
#a = #xegpu.layout<inst_data = [8, 16]>
gpu.module @test_kernel {
//CHECK-LABEL: gpu.func @convert_layout_scalar
+ // CHECK-NOT: xegpu.convert_layout
gpu.func @convert_layout_scalar(%arg0: memref<16x16xf16>, %arg1: memref<4xf16>) {
%acc = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%a_tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a>
%a = xegpu.load_nd %a_tdesc {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16>
%a_reduce = vector.multi_reduction <add>, %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16
- // CHECK-NOT: xegpu.convert_layout
%13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16
memref.store %13, %arg1[%c0] : memref<4xf16>
gpu.return
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index c3be138fef38a..750007077164f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -161,58 +161,12 @@ gpu.module @test {
//-----
- // CHECK-LABEL: create_tdesc_vec
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- }
-
//-----
- // CHECK-LABEL: create_tdesc_step
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- }
-
//-----
- // CHECK-LABEL: load
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- gpu.func @load(%src: ui64) -> vector<32xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
-
- gpu.return %ld : vector<32xf32>
- }
-
//-----
-
// CHECK-LABEL: load_with_offsets
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
@@ -233,48 +187,7 @@ gpu.module @test {
//-----
- // CHECK-LABEL: prefetch
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @prefetch(%src: ui64) {
-
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return
- }
-
//-----
-
- // CHECK-LABEL: store
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
- gpu.func @store(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.0>: vector<32xf32>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1>
-
- gpu.return
- }
//-----
@@ -299,68 +212,10 @@ gpu.module @test {
}
//-----
- // CHECK-LABEL: create_tdesc_step_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4 : i64>>
- gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
- }
//-----
- // CHECK-LABEL: create_tdesc_step_chunk2
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- }
-
-// CHECK-LABEL: create_tdesc_step_chunk3
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>> {
- %step = arith.constant dense<8> : vector<16xindex>
- %seq = vector.step : vector<16xindex>
- %cst = arith.muli %seq, %step : vector<16xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
- gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
- }
//-----
- // CHECK-LABEL: load_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
-
- gpu.func @load_chunk(%src: ui64) -> vector<32x4xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
-
- gpu.return %ld : vector<32x4xf32>
- }
//-----
// CHECK-LABEL: load_with_offsets_chunk
@@ -386,27 +241,6 @@ gpu.module @test {
}
//-----
- // CHECK-LABEL: store_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
- gpu.func @store_chunk(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16,2]>>, vector<32xi1>
-
- gpu.return
- }
//-----
// CHECK-LABEL: store_with_offsets_chunk
@@ -434,42 +268,7 @@ gpu.module @test {
}
//-----
- // CHECK-LABEL: prefetch_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @prefetch_chunk(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- gpu.return
- }
//-----
- // CHECK-LABEL: update_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %delta = arith.constant dense<32>: vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
-
- gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- }
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 4760016bdcea4..3394d63dcbbdc 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -61,8 +61,8 @@ struct TestXeGPUUnrollingPatterns
-> std::optional<SmallVector<int64_t>> {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
- xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
- xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
+ xegpu::PrefetchOp, xegpu::LoadGatherOp, xegpu::StoreScatterOp>(
+ op)) {
xegpu::TensorDescType tdescTy;
if (auto createNdOp = dyn_cast<xegpu::CreateNdDescOp>(op)) {
tdescTy = createNdOp.getType();
@@ -74,10 +74,6 @@ struct TestXeGPUUnrollingPatterns
tdescTy = loadNdOp.getTensorDescType();
} else if (auto storeNdOp = dyn_cast<xegpu::StoreNdOp>(op)) {
tdescTy = storeNdOp.getTensorDescType();
- } else if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
- tdescTy = createOp.getType();
- } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
- tdescTy = updateOp.getTensorDescType();
} else if (auto prefetchOp = dyn_cast<xegpu::PrefetchOp>(op)) {
tdescTy = prefetchOp.getTensorDescType();
} else if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
@@ -130,24 +126,6 @@ struct TestXeGPUUnrollingPatterns
Attribute encoding = tdescTy.getEncoding();
auto layout = tdescTy.getLayoutAttr();
- // If the encoding is a ScatterTensorDescAttr, we need to
- // potentially adjust the chunk size based on the inst_data.
- if (tdescTy.isScattered()) {
- int64_t chunkSize = tdescTy.getChunkSizeAsInt();
-
- if (chunkSize > 1) {
- int64_t blockedChunkSize = chunkSize;
- auto instData = layout.getEffectiveInstDataAsInt();
- if (!instData.empty())
- blockedChunkSize = instData.back();
-
- // To create a new attribute with a different chunk_size:
- auto newEncoding = xegpu::ScatterTensorDescAttr::get(
- ctx, tdescTy.getMemorySpace(), blockedChunkSize);
-
- encoding = newEncoding;
- }
- }
if (layout) {
if (layout.getEffectiveLaneLayoutAsInt().empty())
layout = xegpu::LayoutAttr();
More information about the Mlir-commits
mailing list