[Mlir-commits] [mlir] [MLIR][XeGPU] Clean up deprecated XeGPU ops & passes (PR #180757)
Nishant Patel
llvmlistbot at llvm.org
Tue Feb 10 07:40:02 PST 2026
https://github.com/nbpatel created https://github.com/llvm/llvm-project/pull/180757
None
>From a24088821a323c58e388e8def45bce0667c003d4 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 2 Feb 2026 23:02:47 +0000
Subject: [PATCH 1/4] Clean up XeGPU ops
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 142 +----
.../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 4 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 131 +----
.../XeGPU/TransformOps/XeGPUTransformOps.cpp | 11 +-
.../XeGPU/Transforms/XeGPUBlocking.cpp | 4 +-
.../XeGPU/Transforms/XeGPUFoldAliasOps.cpp | 8 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 23 -
.../Transforms/XeGPUSubgroupDistribute.cpp | 5 +-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 96 +---
.../Transforms/XeGPUWgToSgDistribute.cpp | 158 +-----
mlir/test/Dialect/XeGPU/invalid.mlir | 410 ++-------------
mlir/test/Dialect/XeGPU/layout.mlir | 16 +-
mlir/test/Dialect/XeGPU/ops.mlir | 495 ++++--------------
.../XeGPU/propagate-layout-inst-data.mlir | 68 +--
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 325 +++++-------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 295 +++--------
.../Dialect/XeGPU/xegpu-fold-alias-ops.mlir | 14 +-
...xegpu-unroll-patterns-no-desc-offsets.mlir | 3 +-
.../Dialect/XeGPU/xegpu-unroll-patterns.mlir | 379 ++------------
.../Dialect/XeGPU/xegpu-vector-linearize.mlir | 25 +-
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 60 +--
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 231 --------
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 8 +-
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 387 --------------
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 8 +-
25 files changed, 510 insertions(+), 2796 deletions(-)
delete mode 100644 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
delete mode 100644 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 2cbec50772b98..26c1ffb936942 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -100,41 +100,34 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
```mlir
%0 = memref.alloc() : memref<1024x1024xf32>
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %0: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
```
Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
```mlir
%0 = memref.alloc(%h, %w) : memref<?x?xf32>
- %c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %0, [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
```
Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
```mlir
%0 = ... : ui64
- %c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %0, [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
```
}];
let arguments = (ins
XeGPU_BaseAddrType: $source,
- Variadic<Index>: $offsets,
Variadic<Index>: $shape,
Variadic<Index>: $strides,
- OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<DenseI64ArrayAttr>: $const_shape,
OptionalAttr<DenseI64ArrayAttr>: $const_strides
);
let assemblyFormat = [{
$source ``
- custom<OptionalDynamicIndexList>($offsets, $const_offsets)
(`,` `shape` `:` custom<DynamicIndexList>($shape, $const_shape)^
`,` `strides``:` custom<DynamicIndexList>($strides, $const_strides))?
attr-dict `:` type($source) `->` qualified(type($TensorDesc))
@@ -148,14 +141,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source)>,
OpBuilder<(ins "Type": $tdesc, "Value ": $source,
- "llvm::ArrayRef<OpFoldResult>": $shape,
- "llvm::ArrayRef<OpFoldResult>": $strides)>,
-
- OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
-
- OpBuilder<(ins "Type": $tdesc, "Value": $source,
- "llvm::ArrayRef<OpFoldResult>": $offsets,
"llvm::ArrayRef<OpFoldResult>": $shape,
"llvm::ArrayRef<OpFoldResult>": $strides)>
];
@@ -181,14 +166,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
return getType().getShape();
}
- SmallVector<OpFoldResult> getMixedOffsets() {
- auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
- auto dynamics = getOffsets();
- if (statics.size() == 0 && dynamics.size() == 0)
- return {};
- return getMixedValues(statics, dynamics, getContext());
- }
-
SmallVector<OpFoldResult> getMixedSizes() {
SmallVector<int64_t> statics;
@@ -215,7 +192,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
return getMixedValues(statics, getStrides(), getContext());
}
- /// Return the number of leading operands before the `offsets`,
+ /// Return the number of leading operands before the
/// `shape` and `strides` operands.
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
@@ -648,107 +625,6 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
let hasVerifier = 1;
}
-def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
- let summary = "create scattered tensor descriptors (TensorDesc).";
- let description = [{
- "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
- a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
- is for creating continuous subviews, "create_tdesc" is for creating non-continuous
- (scattered) subviews, allowing each lane in a subgroup specifying their own offset.
- It accepts the following parameters:
-
- Arguments:
-
- - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
- memory object.
-
- - `offsets`: a vector containing offsets of each access point. Its size
- is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
- implying each element in the vector corresponds to a SIMT lane in the subgroup.
-
- Results:
- - `res`: scattered tensor descriptor
-
- The first dimension of the result TensorDesc corresponds to lanes, so it should
- match the dimension of offsets. It may also has a second dimension corresponding to
- the chunk_size if the chunk size is larger than 1.
-
- Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
- ```mlir
- %a = memref.alloc() : memref<1024xf32>
- %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
- ```
-
- Example 2: It assumes subgroup size is 4, and each workitem access 8 elements.
- It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
-
- Example 3: It is similar to Example 2, but there is some overlaps among workitems.
- It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
- }];
-
- let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source,
- XeGPU_OffsetType:$offsets);
- let results = (outs XeGPU_TensorDesc:$TensorDesc);
-
- let builders = [
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<int64_t>": $offsets)>,
- ];
-
- let assemblyFormat = [{
- $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
- }];
-
- let extraClassDeclaration = [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- mlir::VectorType getOffsetsType() {
- return getOffsets().getType();
- }
-
- size_t getNumOffsets() {
- return getOffsetsType().getNumElements();
- }
-
- mlir::Value getViewSource() { return getSource(); }
-
- unsigned getSourceMemorySpace() {
- auto srcTy = getSource().getType();
- if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
- auto attr = memrefTy.getMemorySpace();
- if (attr) {
- if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
- return static_cast<unsigned>(intAttr.getInt());
- if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
- return static_cast<unsigned>(memSpaceAttr.getValue());
- }
- }
- // take global as default memory scope.
- return static_cast<unsigned>(MemorySpace::Global);
- }
-
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
@@ -765,7 +641,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
+ In case of tensor_desc, offsets are encoded in the descriptor.
tensor_desc cannot be used at lane level.
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
@@ -794,7 +670,6 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
The source operand could be a raw pointer (ui64, ui32, i64, i32).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
@@ -897,7 +772,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
+ In case of tensor_desc, offsets are encoded in the descriptor.
tensor_desc cannot be used at lane level.
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
@@ -942,7 +817,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
Example 3 (Subgroup level):
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
- The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc
+ The source operand could be a raw pointer (ui64, ui32, i64, i32).
for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
@@ -1068,7 +943,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
- `dest`: represents the memory region to be stored to, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
+ In case of tensor_desc, offsets are encoded in the descriptor.
tensor_desc cannot be used at lane level.
- `offsets`: represents offsets from dest. required if `source` in not a TensorDescType.
@@ -1109,7 +984,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
The dest operand could be a raw pointer (uint64_t).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%val = arith.constant dense<0.0> : vector<16xf32>
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 8a06271eadd84..efdc0df199ca7 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -177,9 +177,7 @@ class CreateNdDescToXeVMPattern
matchAndRewrite(xegpu::CreateNdDescOp op,
xegpu::CreateNdDescOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
- if (mixedOffsets.size() != 0)
- return rewriter.notifyMatchFailure(op, "Offsets not supported.");
+ // CreateNdDescOp no longer supports offsets (version 1 removed)
auto loc = op.getLoc();
auto source = op.getSource();
// Op is lowered to a code sequence that populates payload.
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 91ba07a8e0256..ad085f2c9fe9e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -247,10 +247,8 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
[[maybe_unused]] auto ty = source.getType();
assert(ty.hasStaticShape() && "expecting a memref with static shape");
- build(builder, state, tdesc, source, ValueRange({}) /* dynamic offsets */,
- ValueRange({}) /* empty dynamic shape */,
+ build(builder, state, tdesc, source, ValueRange({}) /* empty dynamic shape */,
ValueRange({}) /* empty dynamic strides */,
- DenseI64ArrayAttr({}) /* const offsets */,
DenseI64ArrayAttr({}) /* empty const shape*/,
DenseI64ArrayAttr({}) /* empty const strides*/);
}
@@ -289,74 +287,11 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
}
}
- build(builder, state, tdesc, source, ValueRange({}), dynamicShape,
- dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr,
+ build(builder, state, tdesc, source, dynamicShape,
+ dynamicStrides, staticShapeAttr,
staticStridesAttr);
}
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
- Type tdesc, TypedValue<MemRefType> source,
- llvm::ArrayRef<OpFoldResult> offsets) {
- [[maybe_unused]] auto ty = source.getType();
- assert(ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
-
- llvm::SmallVector<int64_t> staticOffsets;
- llvm::SmallVector<Value> dynamicOffsets;
- dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
-
- build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
- ValueRange({}) /* empty dynamic shape */,
- ValueRange({}) /* empty dynamic strides */,
- builder.getDenseI64ArrayAttr(staticOffsets) /* const offsets */,
- {} /* empty const shape*/, {} /* empty const strides*/);
-}
-
-void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
- Type tdesc, Value source,
- llvm::ArrayRef<OpFoldResult> offsets,
- llvm::ArrayRef<OpFoldResult> shape,
- llvm::ArrayRef<OpFoldResult> strides) {
- assert(!shape.empty() && !offsets.empty() && !strides.empty() &&
- shape.size() == strides.size() && shape.size() == offsets.size());
-
- Type srcTy = source.getType();
- assert((isa<IntegerType, MemRefType>(srcTy)) &&
- "Source has to be either int or memref.");
-
- llvm::SmallVector<Value> dynamicOffsets;
- llvm::SmallVector<Value> dynamicShape;
- llvm::SmallVector<Value> dynamicStrides;
-
- llvm::SmallVector<int64_t> staticOffsets;
- llvm::SmallVector<int64_t> staticShape;
- llvm::SmallVector<int64_t> staticStrides;
-
- dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
- dispatchIndexOpFoldResults(shape, dynamicShape, staticShape);
- dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides);
-
- auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
- auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape);
- auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides);
-
- if (auto memrefTy = dyn_cast<MemRefType>(srcTy)) {
- auto memrefShape = memrefTy.getShape();
- auto [memrefStrides, _] = memrefTy.getStridesAndOffset();
-
- // if shape and strides are from Memref, we don't need attributes for them
- // to keep the IR print clean (only do so for full-static case, otherwise
- // printer would fail trying to print empty array-attr).
- if (staticShape == memrefShape && staticStrides == memrefStrides &&
- dynamicShape.empty() && dynamicStrides.empty()) {
- staticShapeAttr = DenseI64ArrayAttr();
- staticStridesAttr = DenseI64ArrayAttr();
- }
- }
-
- build(builder, state, tdesc, source, dynamicOffsets, dynamicShape,
- dynamicStrides, staticOffsetsAttr, staticShapeAttr, staticStridesAttr);
-}
-
LogicalResult CreateNdDescOp::verify() {
size_t rank = getMixedSizes().size();
bool invalidRank = rank != getMixedStrides().size();
@@ -373,9 +308,6 @@ LogicalResult CreateNdDescOp::verify() {
<< " Source: " << srcMemorySpace
<< ", TensorDesc: " << tdescMemorySpace;
- if (size_t offsetRank = getMixedOffsets().size())
- invalidRank |= (offsetRank != rank);
-
// check source type matches the rank if it is a memref.
// It also should have the same ElementType as TensorDesc.
if (auto memrefTy = dyn_cast<MemRefType>(getSourceType()))
@@ -390,14 +322,14 @@ LogicalResult CreateNdDescOp::verify() {
if (invalidRank)
return emitOpError(
- "Expecting the rank of shape, strides, offsets, and source (if source "
+ "Expecting the rank of shape, strides, and source (if source "
"is a memref) should match with each other.");
// check result TensorDesc rank
if (getType().getRank() > (int64_t)rank)
return emitOpError(
"Expecting the TensorDesc rank is not greater than the "
- "ranks of shape, strides, offsets or the memref source.");
+ "ranks of shape, strides or the memref source.");
if (invalidElemTy)
return emitOpError("TensorDesc should have the same element "
@@ -739,59 +671,6 @@ LogicalResult UpdateNdOffsetOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// XeGPU_CreateDescOp
-//===----------------------------------------------------------------------===//
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<OpFoldResult> offsets) {
- auto loc = source.getLoc();
- int64_t size = static_cast<int64_t>(offsets.size());
- auto type = VectorType::get(size, builder.getIndexType());
- auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
- auto offset = vector::FromElementsOp::create(builder, loc, type, values);
- build(builder, state, TensorDesc, source, offset);
-}
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<int64_t> offsets) {
- auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
- build(builder, state, TensorDesc, source, ofrs);
-}
-
-LogicalResult CreateDescOp::verify() {
- auto tdescTy = getTensorDescType();
-
- if (!tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.\n");
-
- // Memory space of created TensorDesc should match with the source.
- // Both source and TensorDesc are considered for global memory by default,
- // if the memory scope attr is not specified. If source is an integer,
- // it is considered as ptr to global memory.
- auto srcMemorySpace = getSourceMemorySpace();
- auto tdescMemorySpace = static_cast<unsigned>(tdescTy.getMemorySpace());
- if (srcMemorySpace != tdescMemorySpace)
- return emitOpError("Memory space mismatch.")
- << " Source: " << srcMemorySpace
- << ", TensorDesc: " << tdescMemorySpace;
-
- // check total size
- auto chunkSize = tdescTy.getChunkSizeAsInt();
- SmallVector<int64_t> shape(getOffsetsType().getShape());
- if (chunkSize != 1)
- shape.push_back(chunkSize);
-
- auto tdescShape = getShapeOf(tdescTy);
- if (shape != tdescShape)
- return emitOpError("Incorrect TensorDesc shape. ")
- << "Expected is " << makeString(shape) << "\n";
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_PrefetchOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
index 285dfac56be08..7db208814c4ce 100644
--- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
@@ -170,8 +170,7 @@ static xegpu::CreateNdDescOp
setDescLayout(transform::TransformRewriter &rewriter,
xegpu::CreateNdDescOp descOp,
xegpu::DistributeLayoutAttr layout) {
- assert(descOp.getMixedOffsets().size() == 0 &&
- "create desc op with offsets is not supported");
+ // CreateNdDescOp no longer supports offsets
auto oldTensorDesc = descOp.getType();
auto descType = xegpu::TensorDescType::get(
oldTensorDesc.getShape(), oldTensorDesc.getElementType(),
@@ -461,7 +460,7 @@ transform::InsertPrefetchOp::apply(transform::TransformRewriter &rewriter,
if (!maybeLoadOp)
return emitSilenceableFailure(getLoc()) << "Could not find load op.";
auto loadOp = *maybeLoadOp;
- if (loadOp.getMixedOffsets().size() == 0) {
+ if (loadOp.getMixedOffsets().empty()) {
auto diag = emitSilenceableFailure(getLoc())
<< "Load op must have offsets.";
diag.attachNote(loadOp.getLoc()) << "load op";
@@ -482,11 +481,7 @@ transform::InsertPrefetchOp::apply(transform::TransformRewriter &rewriter,
if (!maybeDescOp)
return emitSilenceableFailure(getLoc()) << "Could not find descriptor op.";
auto descOp = *maybeDescOp;
- if (descOp.getMixedOffsets().size() > 0) {
- auto diag = emitSilenceableFailure(getLoc())
- << "desc op with offsets is not supported.";
- diag.attachNote(descOp.getLoc()) << "desc op";
- }
+ // CreateNdDescOp no longer supports offsets
// Clone desc op outside the loop.
rewriter.setInsertionPoint(forOp);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index c00b7d42d48a6..bd30d40ff4d64 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -180,8 +180,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
+ xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
xegpu::StoreMatrixOp>(op))
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
index 3fccc45fecf40..0db45895b87b2 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
@@ -28,6 +28,9 @@ using namespace mlir;
namespace {
/// Merges subview operation with xegpu.create_nd_tdesc operation.
+/// NOTE: This pattern is currently disabled because CreateNdDescOp no longer
+/// supports offsets. Offsets should be specified on load/store/prefetch ops.
+/*
class XegpuCreateNdDescOpSubViewOpFolder final
: public OpRewritePattern<xegpu::CreateNdDescOp> {
public:
@@ -59,9 +62,12 @@ LogicalResult XegpuCreateNdDescOpSubViewOpFolder::matchAndRewrite(
return success();
}
+*/
+} // namespace
void xegpu::populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns) {
- patterns.add<XegpuCreateNdDescOpSubViewOpFolder>(patterns.getContext());
+ // XegpuCreateNdDescOpSubViewOpFolder is disabled - CreateNdDescOp no longer supports offsets
+ // patterns.add<XegpuCreateNdDescOpSubViewOpFolder>(patterns.getContext());
}
namespace {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 96fdced39d9ab..50fb7b572e13d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -395,10 +395,6 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- void visitCreateDescOp(xegpu::CreateDescOp createDesc,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results);
-
void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -473,9 +469,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](xegpu::LoadGatherOp loadGatherOp) {
visitLoadGatherOp(loadGatherOp, operands, results);
})
- .Case([&](xegpu::CreateDescOp createDescOp) {
- visitCreateDescOp(createDescOp, operands, results);
- })
.Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) {
visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
})
@@ -1231,22 +1224,6 @@ void LayoutInfoPropagation::visitLoadGatherOp(
propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
}
-/// Propagate the layout of the descriptor to the vector offset operand in
-/// CreateDescOp.
-void LayoutInfoPropagation::visitCreateDescOp(
- xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo descLayout = results[0]->getValue();
- // Need the layout of the descriptor to propagate to the operands.
- if (!descLayout.isAssigned())
- return;
- auto uArch = getUArch(getChipStr(createDesc).value_or(""));
- // For offset operand propagate 1D default layout.
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
- uArch->getSubgroupSize());
- propagateIfChanged(operands[1], operands[1]->meet(layout));
-}
-
/// Set the layout for the value, tensor descriptor, offset and mask operands in
/// the StoreScatterOp.
void LayoutInfoPropagation::visitStoreScatterOp(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a8ed5a289f84a..074bf74d871fc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -251,10 +251,7 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
- // CreateNdOp must not have offsets.
- if (descOp.getMixedOffsets().size())
- return rewriter.notifyMatchFailure(
- descOp, "xegpu::CreateNdDescOp must not have offsets");
+ // CreateNdDescOp no longer supports offsets (version 1 removed)
SmallVector<size_t> newRetIndices;
rewriter.setInsertionPoint(warpOp);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 8f4e2bb0451d8..4f81f81cc351a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -176,22 +176,12 @@ struct UnrollCreateNdOp : public UnrollPattern<xegpu::CreateNdDescOp> {
SmallVector<Value> newOps;
auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
- bool hasOffsets = op.getMixedOffsets().size() != 0;
- if (!hasOffsets) {
- auto newOp = xegpu::CreateNdDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), op.getMixedSizes(),
- op.getMixedStrides());
- newOps.push_back(newOp);
- } else {
- auto createOp = [&](SmallVector<OpFoldResult> offsets) -> Value {
- return xegpu::CreateNdDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), offsets,
- op.getMixedSizes(), op.getMixedStrides());
- };
+ // CreateNdDescOp no longer supports offsets - create single descriptor
+ auto newOp = xegpu::CreateNdDescOp::create(
+ rewriter, loc, newTdescTy, op.getSource(), op.getMixedSizes(),
+ op.getMixedStrides());
+ newOps.push_back(newOp);
- newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy,
- *targetShape, createOp, loc, rewriter);
- }
Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
rewriter.replaceOp(op, castOp);
@@ -476,74 +466,6 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
}
};
-struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
- using UnrollPattern<xegpu::CreateDescOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::CreateDescOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getType();
- TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
- VectorType indiceVecTy = indiceVec.getType();
-
- if (!tdescTy.isScattered())
- return failure();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<int64_t> targetIndiceShape(*targetShape);
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
- // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1.
- if (originalChunkSize > 1)
- targetIndiceShape.pop_back();
-
- auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
- SmallVector<Type> convertedIndiceTypes =
- getUnrolledTypes(indiceVecTy, targetIndiceShape);
- SmallVector<Value> convertedIndiceVec =
- pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter);
-
- SmallVector<Value> newOps;
-
- // More indices is need when chunkSize > 1. Since a big load from one
- // address could be break into multiple small loads.
- if (originalChunkSize > 1) {
- int64_t blockedChunkSize = targetShape->back();
- int64_t numNewChunks = originalChunkSize / blockedChunkSize;
-
- for (auto [indice, indiceType] :
- llvm::zip(convertedIndiceVec, convertedIndiceTypes)) {
- for (int64_t i = 0; i < numNewChunks; ++i) {
- // Compute the offset
- Value inc = arith::ConstantIndexOp::create(rewriter, loc,
- i * blockedChunkSize);
- Value incVec =
- vector::BroadcastOp::create(rewriter, loc, indiceType, inc);
- Value offsetIndice =
- arith::AddIOp::create(rewriter, loc, indice, incVec);
-
- auto newOp = xegpu::CreateDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), offsetIndice);
-
- newOps.push_back(newOp);
- }
- }
- } else {
- for (auto indice : convertedIndiceVec) {
- auto newOp = xegpu::CreateDescOp::create(rewriter, loc, newTdescTy,
- op.getSource(), indice);
- newOps.push_back(newOp);
- }
- }
-
- Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
- rewriter.replaceOp(op, castOp);
-
- return success();
- }
-};
-
struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
using UnrollPattern<xegpu::LoadGatherOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadGatherOp op,
@@ -1037,9 +959,9 @@ void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
patterns
.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
- UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
- UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp,
- UnrollLoadGatherOpWithOffset, UnrollStoreScatterOpWithOffsets>(
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollLoadGatherOp,
+ UnrollStoreScatterOp, UnrollPrefetchOp, UnrollUpdateOffsetOp,
+ UnrollLoadMatrixOp, UnrollStoreMatrixOp,
+ UnrollLoadGatherOpWithOffset, UnrollStoreScatterOpWithOffsets>(
patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 45a002b63abd6..b87afd02f385c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -139,87 +139,8 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
return success();
}
-/// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
-/// from a workgroup descriptor. It replaces the offsets and sizes with
-/// appropriate values for the subgroup.
-/// It uses round-robin assignment to distribute the work to the subgroups.
-/// Following create_nd_desc operation:,
-/// %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x24xf32>
-/// -> !xegpu.tensor_desc<24x24xf32, #xegpu.layout<sg_layout = [4, 4],
-/// sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-/// is converted to 9 subgroup level operations based on the sg_layout &
-/// sg_data:
-/// %tdesc = xegpu.create_nd_tdesc %src[off1, off2] : memref<24x24xf32> ->
-/// !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2],
-/// lane_data = [1, 1]>>
-///
-/// The sg_layout and sg_data attributes are dropped after the pass as they are
-/// no longer needed.
-///
-/// 24x24 matrix distribution example:
-/// sg_layout = [4, 4], sg_data = [2, 2]
-/// Each 8x8 matrix within the 24x24 matrix is called a distribution unit.
-/// dist_unit_shape = [8, 8] --> sg_layout[i] * sg_data[i]
-///
-/// +------------------------+
-/// | 8x8 | 8x8 | 8x8 | <- 3 tiles across
-/// |-----+-----+-----|
-/// | 8x8 | 8x8 | 8x8 | <- 3 tiles down
-/// |-----+-----+-----|
-/// | 8x8 | 8x8 | 8x8 |
-/// +------------------------+
-///
-/// Each 8x8 tile is further subdivided among subgroups:
-/// +------------------------+
-/// | 2x2 2x2 2x2 2x2 | <- 4 subgroups across (each handles 2 columns)
-/// | 2x2 2x2 2x2 2x2 | <- 4 subgroups down (each handles 2 rows)
-/// | 2x2 2x2 2x2 2x2 |
-/// | 2x2 2x2 2x2 2x2 |
-/// +------------------------+
-///
-/// Since the 24x24 matrix is divided into 8x8 distribution units, there will be
-/// 9 distribution units (3x3) in total. Hence the 9 subgroup level operations.
-
-/// The pass currently has entire distribution logic in the WgToSgCreateNdOp
-/// pattern and all the other ops just follow.
-/// TODO: Decouple the distribution logic from WgToSgCreateNdOp for all the
-/// ops in the pass.
-struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
- using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
-
- LogicalResult
- matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- SmallVector<SmallVector<OpFoldResult>> offsetsList;
- if (failed(genOffsetsList(rewriter, op, offsetsList)))
- return failure();
-
- MLIRContext *ctx = op.getContext();
- xegpu::TensorDescType tdescTy = op.getType();
- ArrayRef<int64_t> wgShape = tdescTy.getShape();
- Type elemTy = tdescTy.getElementType();
- xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
- SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
- auto newTdescTy =
- xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
- layout.dropSgLayoutAndData());
-
- SmallVector<Value> newOps;
- for (auto offsets : offsetsList) {
- auto newOp = xegpu::CreateNdDescOp::create(
- rewriter, op.getLoc(), newTdescTy, op.getSource(), offsets,
- op.getMixedSizes(), op.getMixedStrides());
-
- newOps.push_back(newOp);
- }
- rewriter.replaceOpWithMultiple(op, {newOps});
-
- return success();
- }
-};
-
-// This pattern transforms the CreateNdDescOp without offsets to create a
-// subgroup descriptor from a workgroup descriptor
+// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
+// from a workgroup descriptor without offsets
struct WgToSgCreateNdOpNoOffset
: public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -228,10 +149,6 @@ struct WgToSgCreateNdOpNoOffset
matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- // Check no offsets are specified.
- if (!op.getMixedOffsets().empty())
- return failure();
-
Location loc = op.getLoc();
MLIRContext *ctx = op.getContext();
xegpu::TensorDescType tdescTy = op.getType();
@@ -261,51 +178,6 @@ struct WgToSgCreateNdOpNoOffset
}
};
-/// This pattern transforms the LoadNdOp to load subgroup data.
-struct WgToSgLoadNdOp : public OpConversionPattern<xegpu::LoadNdOp> {
- using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
- LogicalResult
- matchAndRewrite(xegpu::LoadNdOp op, OneToNOpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (!op.getMixedOffsets().empty())
- return failure();
-
- SmallVector<Value> newLoadOps;
- for (auto src : adaptor.getTensorDesc()) {
- xegpu::TensorDescType tdescTy =
- dyn_cast<xegpu::TensorDescType>(src.getType());
- ArrayRef<int64_t> srcShape = tdescTy.getShape();
- VectorType newResTy = VectorType::get(srcShape, tdescTy.getElementType());
- auto newLoadOp = xegpu::LoadNdOp::create(
- rewriter, op.getLoc(), newResTy, src,
- xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
- newLoadOps.push_back(newLoadOp);
- }
- rewriter.replaceOpWithMultiple(op, {newLoadOps});
- return mlir::success();
- }
-};
-
-/// This pattern transforms the StoreNdOp to store to a subgroup descriptor
-/// It creates a StoreNdOp op to store the updated values to the new subgroup
-/// src tensor descriptors.
-struct WgToSgStoreNdOp : public OpConversionPattern<xegpu::StoreNdOp> {
- using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
- LogicalResult
- matchAndRewrite(xegpu::StoreNdOp op, OneToNOpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (!op.getMixedOffsets().empty())
- return failure();
-
- for (auto [v, t] : llvm::zip(adaptor.getValue(), adaptor.getTensorDesc()))
- xegpu::StoreNdOp::create(rewriter, op.getLoc(), v, t, op.getL1HintAttr(),
- op.getL2HintAttr(), op.getL3HintAttr());
-
- rewriter.eraseOp(op);
- return success();
- }
-};
-
// This pattern transforms the LoadNdOp with explicit offsets to load
// subgroup data.
struct WgToSgLoadNdOpWithOffset : public OpConversionPattern<xegpu::LoadNdOp> {
@@ -462,26 +334,6 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
}
};
-/// This pattern transforms the PrefetchNdOp to prefetch the subgroup data.
-struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
- using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
- LogicalResult
- matchAndRewrite(xegpu::PrefetchNdOp op, OneToNOpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
-
- int64_t offsetSize = static_cast<int64_t>(op.getOffsets().size());
- if ((offsetSize != 0) || op.getConstOffsetsAttr())
- return failure();
-
- for (auto src : adaptor.getTensorDesc())
- xegpu::PrefetchNdOp::create(
- rewriter, op.getLoc(), TypeRange(), src,
- xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
- rewriter.eraseOp(op);
- return success();
- }
-};
-
/// This pattern transforms vector.broadcast ops to work at subgroup level.
struct WgToSgVectorBroadcastOp
: public OpConversionPattern<vector::BroadcastOp> {
@@ -1699,9 +1551,9 @@ namespace mlir {
namespace xegpu {
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns
- .add<WgToSgCreateNdOp, WgToSgCreateNdOpNoOffset, WgToSgLoadNdOp,
- WgToSgLoadNdOpWithOffset, WgToSgStoreNdOp, WgToSgStoreNdOpWithOffset,
- WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
+ .add<WgToSgCreateNdOpNoOffset,
+ WgToSgLoadNdOpWithOffset, WgToSgStoreNdOpWithOffset,
+ WgToSgUpdateNdOffsetOp, WgToSgDpasOp,
WgToSgPrefetchNdOpWithOffset, UnrealizedConversionCastOpPattern,
WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp,
WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset,
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index f2011ab86e9e9..52b6264481c08 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -2,8 +2,8 @@
// -----
func.func @create_nd_tdesc_1(%src: memref<24xf32>) {
- // expected-error at +1 {{Expecting the TensorDesc rank is not greater than the ranks of shape, strides, offsets or the memref source}}
- %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // expected-error at +1 {{Expecting the TensorDesc rank is not greater than the ranks of shape, strides or the memref source}}
+ %1 = xegpu.create_nd_tdesc %src : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
return
}
@@ -11,42 +11,42 @@ func.func @create_nd_tdesc_1(%src: memref<24xf32>) {
func.func @create_nd_tdesc_2(%src: memref<24x32xf32>) {
// expected-error at +1 {{TensorDesc should have the same element type with the source if it is a memref}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
return
}
// -----
func.func @create_nd_tdesc_3(%src: memref<2x24x32xf32, 3>) {
// expected-error at +1 {{SLM is only supported for 1D block tensor}}
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
+ %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
return
}
// -----
func.func @create_nd_tdesc_4(%src: memref<2x24x32xf32, 3>) {
// expected-error at +1 {{Memory space mismatch}}
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32>
return
}
// -----
func.func @create_nd_tdesc_5(%src: memref<128x128xf32>) {
// expected-error at +1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>>
return
}
// -----
func.func @create_nd_tdesc_6(%src: memref<128x128xf32>) {
// expected-error at +1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>>
return
}
// -----
func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) {
// expected-error at +1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>>
return
}
@@ -60,59 +60,37 @@ func.func @create_nd_tdesc_8(%src: ui64) {
// -----
func.func @create_nd_tdesc_9(%src: ui64) {
// expected-error at +1 {{expecting strides and shape to be present for integer source}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32>
+ %1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32>
return
}
// -----
func.func @create_nd_tdesc_10(%src: memref<24xindex>) {
// expected-error @+1 {{unsupported element type 'index': expected integer or float}}
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24xindex> -> !xegpu.tensor_desc<24xindex>
+ %1 = xegpu.create_nd_tdesc %src : memref<24xindex> -> !xegpu.tensor_desc<24xindex>
return
}
// -----
func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
// expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<8x16xf16>
return
}
-// -----
-func.func @prefetch_nd_vc_2(%src: memref<24xf16>) {
- %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
- // expected-error at +1 {{Expects a non-scattered TensorDesc}}
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
- : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
- return
-}
-
// -----
func.func @load_nd_vc_1(%src: memref<8x16xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
// expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>
: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
return
}
-// -----
-func.func @load_nd_vc_2(%src: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Expects a non-scattered TensorDesc.}}
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>}>
- : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>> -> vector<8x2xf16>
- return
-}
-
// -----
func.func @load_nd_vc_3(%src: memref<8x16xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
// expected-warning at +1 {{Invalid Packed Attr.}}
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<16xf16> -> vector<16xf16>
@@ -121,7 +99,7 @@ func.func @load_nd_vc_3(%src: memref<8x16xf16>) {
// -----
func.func @load_nd_vc_4(%src: memref<24x32xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32>
// expected-error at +1 {{Result shape [8, 1] is not consistent with tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -132,7 +110,7 @@ func.func @load_nd_vc_4(%src: memref<24x32xf32>) {
// -----
func.func @subgroup_load_nd_9(%src: memref<4x8x16xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<4x8x16xf16> -> !xegpu.tensor_desc<4x8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<4x8x16xf16> -> !xegpu.tensor_desc<4x8x16xf16>
// expected-error at +1 {{Expects a 1D or 2D TensorDesc}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8x16xf16> -> vector<4x8x16xf16>
return
@@ -165,7 +143,7 @@ func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) {
// -----
func.func @load_nd_layout(%src: memref<24x32xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
// expected-error at +1 {{Result shape [3] is not a valid distribution for tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<3xf32>
@@ -174,7 +152,7 @@ func.func @load_nd_layout(%src: memref<24x32xf32>) {
// -----
func.func @load_nd_simt(%src: memref<24x32xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
%2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8xf32>
return
@@ -183,28 +161,16 @@ func.func @load_nd_simt(%src: memref<24x32xf32>) {
// -----
func.func @store_nd_vc_1(%dst: memref<24x32xf16>) {
%1 = arith.constant dense<1.0>: vector<24x32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
// expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<streaming>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
return
}
-// -----
-func.func @store_nd_vc_2(%dst: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = arith.constant dense<1.0>: vector<8x2xf16>
- %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Expects a non-scattered TensorDesc}}
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<streaming>}>
- : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
// -----
func.func @store_nd_vc_3(%dst: memref<24x32xf16>) {
%1 = arith.constant dense<1.0>: vector<2x24x32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
// expected-error at +1 {{array length is not supported by store_nd}}
xegpu.store_nd %1, %2: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
return
@@ -213,7 +179,7 @@ func.func @store_nd_vc_3(%dst: memref<24x32xf16>) {
// -----
func.func @store_nd_vc_4(%dst: memref<8x24x32xf16>) {
%1 = arith.constant dense<1.0>: vector<8x24x32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0, 0] : memref<8x24x32xf16> -> !xegpu.tensor_desc<8x24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst : memref<8x24x32xf16> -> !xegpu.tensor_desc<8x24x32xf16>
// expected-error at +1 {{Expects a 1D or 2D TensorDesc}}
xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<8x24x32xf16>, !xegpu.tensor_desc<8x24x32xf16>
return
@@ -221,7 +187,7 @@ func.func @store_nd_vc_4(%dst: memref<8x24x32xf16>) {
// -----
func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
- %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.create_nd_tdesc %dst : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
// expected-error at +1 {{Value shape [3] is not a valid distribution for tensor descriptor}}
xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32>
return
@@ -229,7 +195,7 @@ func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
// -----
func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
return
@@ -237,159 +203,31 @@ func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
// -----
func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
- %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
+ %1 = xegpu.create_nd_tdesc %dst : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32>
// expected-error at +1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
xegpu.store_nd %data, %1 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
-// -----
-func.func @update_nd_offset_1(%dst: memref<16xf16>) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
- -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Expects a non-scattered TensorDesc}}
- xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_1(%src: ui64) {
- %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
- // expected-error at +1 {{Expects a scattered TensorDesc}}
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_2(%src: memref<?xf32>) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- // expected-error at +1 {{invalid chunk size}}
- -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 0>>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_3(%src: memref<?xf32>) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error at +1 {{Memory space mismatch}}
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_4(%src: memref<?xf32>) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
- // expected-error at +1 {{expected last dim of tensor to match chunk size}}
- -> !xegpu.tensor_desc<4x5xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4>>
- return
-}
-
-// -----
-func.func @create_tdesc_vc_5(%src: memref<?xf16>) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf16>, vector<4xindex>
- // expected-error at +1 {{last dim of tensor to be a multiple of 2}}
- -> !xegpu.tensor_desc<4x3xf16, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
- return
-}
-
-
// -----
func.func @prefetch_vc_1(%src: memref<24x32xf16>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
// expected-error at +1 {{Expects a scattered TensorDesc}}
xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<24x32xf16>
return
}
-// -----
-func.func @prefetch_vc_2(%src: ui64) {
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- return
-}
-
-// -----
-func.func @create_tdesc_layout_1(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error at +1 {{expected layout rank to match tensor rank}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- return
-}
-
-// -----
-func.func @create_tdesc_layout_2(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error at +1 {{expected last dim of lane_data to be a multiple of: 2}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x4xf16, #xegpu.scatter_tdesc_attr<chunk_size = 4>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- return
-}
-
-// -----
-func.func @load_gather_simt_1(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<6xf32>
- return
-}
-
-// -----
-func.func @store_scatter_simt_1(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %val = arith.constant dense<2.9>: vector<6xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- return
-}
-
// -----
func.func @load_gather_vc_1(%src: memref<24x32xf16>) {
%0 = arith.constant dense<1>: vector<4xi1>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
// expected-error at +1 {{Expects a scattered TensorDesc}}
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
: !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16>
return
}
-// -----
-func.func @load_gather_vc_2(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<write_back>}>
- : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- -> vector<4x2xf32>
- return
-}
-
-// -----
-func.func @load_gather_vc_3(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<1>: vector<8xi1>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Mask should match TensorDesc except the chunk size dim}}
- %2 = xegpu.load %1, %0
- : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<8xi1>
- -> vector<4x2xf32>
- return
-}
-
// -----
func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) {
%offsets = arith.constant dense<[0]> : vector<1xindex>
@@ -398,16 +236,6 @@ func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) {
return
}
-// -----
-func.func @prefetch_offset_wi_2(%src: memref<16xf32>) {
- %offsets = arith.constant dense<[0]> : vector<1xindex>
- %1 = xegpu.create_tdesc %src, %offsets : memref<16xf32>, vector<1xindex>
- -> !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
- // expected-error at +1 {{offsets not allowed}}
- xegpu.prefetch %1[%offsets]: !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>, vector<1xindex>
- return
-}
-
// -----
func.func @prefetch_offset_wi_3(%src: memref<16xf32>) {
// expected-error at +1 {{Expects offsets}}
@@ -533,39 +361,13 @@ func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) {
func.func @store_scatter_vc_1(%src: memref<24x32xf32>) {
%0 = arith.constant dense<1>: vector<4xi1>
%1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
+ %2 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
// expected-error at +1 {{Expects a scattered TensorDesc}}
xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<cached>}>
: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1>
return
}
-// -----
-func.func @store_scatter_vc_2(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
- %0 = arith.constant dense<1>: vector<4xi1>
- %1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
- xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint<streaming>}> : vector<4x2xf32>,
- !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- return
-}
-
-// -----
-func.func @store_scatter_vc_3(%src: ui64) {
- %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
- %0 = arith.constant dense<1>: vector<8xi1>
- %1 = arith.constant dense<2.9>: vector<4x2xf32>
- %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
- -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Mask should match TensorDesc except the chunk size dim}}
- xegpu.store %1, %2, %0 : vector<4x2xf32>,
- !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<8xi1>
- return
-}
-
// -----
func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
// expected-error at +1 {{K-dimension mismatch}}
@@ -608,18 +410,9 @@ func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
return
}
-// -----
-func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
- %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- // expected-error at +1 {{failed to verify that all of {tensorDesc, value, result} have same shape}}
- xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32>
- return
-}
-
// -----
func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{expected non-zero rank tensor}}
!xegpu.tensor_desc<f32>
return
@@ -627,7 +420,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{expected layout rank to match tensor rank}}
!xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [2, 16], lane_data = [1, 1]>>
return
@@ -635,7 +428,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{expected layout rank to match tensor rank}}
!xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
return
@@ -643,7 +436,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute [4, 8] using #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}}
!xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
return
@@ -651,7 +444,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute [4, 8] using #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}}
!xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
return
@@ -659,7 +452,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute [4, 8] using #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>}}
!xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
return
@@ -667,150 +460,13 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
- %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
+ %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute [4, 8] using #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 2]>}}
!xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 2]>>
return
}
// -----
-func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- // expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
- !xegpu.tensor_desc<16xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- // expected-error at +1 {{expected last dim of tensor to match chunk size}}
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 4>,
- #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
- // expected-error at +1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}}
- %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
- gpu.return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected sg_layout and lane_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected sg_layout and inst_data to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected inst_data and lane_layout to have the same rank}}
- #xegpu.layout<inst_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected lane_data and lane_layout to have the same rank}}
- #xegpu.layout<inst_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2, 1]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected sg_data and sg_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- // expected-error at +1 {{expected layout rank to match tensor rank}}
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<sg_layout = [1], sg_data = [32], inst_data = [16]>>
- return
-}
-
-// -----
-func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected sg_layout being used with sg_data}}
- #xegpu.layout<sg_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected lane_layout being used with lane_data}}
- #xegpu.layout<inst_data = [16, 2], lane_data = [1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected sg_layout/lane_layout being used with order}}
- #xegpu.layout<inst_data = [16, 2], order = [0, 1]>>
- return
-}
-
-// -----
-func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected order and sg_layout to have the same rank}}
- #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2], order = [0, 1, 2]>>
- return
-}
-
-// -----
-func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected order and lane_layout to have the same rank}}
- #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
- return
-}
-
// -----
#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
// expected-error at +1 {{repeated dim (2) in slice attribute}}
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index e4b4e22e5cf97..af4a7a892361e 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -8,29 +8,29 @@
gpu.module @test {
// CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) {
gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) {
gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) {
gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 1e9738f44bb66..f18e7c4237104 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -8,8 +8,8 @@
gpu.module @test {
// CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
gpu.return
}
@@ -17,47 +17,47 @@ gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
+ %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
gpu.return
}
// CHECK: gpu.func @create_nd_tdesc_7(%[[arg0:.*]]: memref<8x24x32x48x64xf32>) {
gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0, 0, 0, 0] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0, 0, 0] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32>
gpu.return
}
@@ -88,8 +88,8 @@ gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index,
gpu.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[%arg3, %arg4], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides:[%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16>
gpu.return
}
@@ -105,19 +105,19 @@ gpu.func @test_create_nd_tdesc_10(%src: memref<?x?xf16>, %w : index, %h : index,
// CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @prefetch_nd(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: xegpu.prefetch_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
gpu.return
}
// CHECK: gpu.func @prefetch_nd_2(%[[arg0:.*]]: memref<48x64xf16>) {
gpu.func @prefetch_nd_2(%src: memref<48x64xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: xegpu.prefetch_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
gpu.return
}
@@ -132,140 +132,140 @@ gpu.func @prefetch_nd_offset_1(%src: memref<48x64xf16>, %x : index, %y : index)
// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
- %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
gpu.return
}
// CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @simt_load_nd(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
gpu.return
}
// CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
+ %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
gpu.return
}
// CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+ %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
gpu.return
}
// CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
gpu.return
}
// CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
gpu.return
}
// CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
- %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
gpu.return
}
// CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
gpu.return
}
// CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
+ %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
gpu.return
}
// CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
+ %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
gpu.return
}
// CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x16x16xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x16x16xf16>
gpu.return
}
// CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
gpu.return
}
// CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x8x16x2xf16>
- %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x8x16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x8x16x2xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x8x16x2xf16>
gpu.return
}
// CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
gpu.return
}
// CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
gpu.return
}
@@ -289,10 +289,10 @@ gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) {
// CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+ %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
gpu.return
}
@@ -310,10 +310,10 @@ gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) {
gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
%1 = arith.constant dense<1.0>: vector<24x32xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ xegpu.store_nd %1, %2[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
gpu.return
}
@@ -321,10 +321,10 @@ gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) {
gpu.func @simt_store_nd(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16>
%1 = arith.constant dense<1.0>: vector<48xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
+ xegpu.store_nd %1, %2[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
gpu.return
}
@@ -343,10 +343,10 @@ gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>, %x : index) {
gpu.func @subgroup_store_nd_offset_1(%dst: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
%1 = arith.constant dense<1.0>: vector<32xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<32xf16>, !xegpu.tensor_desc<32xf16>
+ xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32xf16>, !xegpu.tensor_desc<32xf16>
gpu.return
}
@@ -354,10 +354,10 @@ gpu.func @subgroup_store_nd_offset_1(%dst: memref<24x32xf16>) {
gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
%1 = arith.constant dense<1.0>: vector<2xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ %2 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16>
+ xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16>
gpu.return
}
@@ -374,8 +374,8 @@ gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) {
// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
%2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
gpu.return
@@ -383,139 +383,13 @@ gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @update_nd_tdesc_2(%[[arg0:.*]]: memref<8x24x32xf32>) {
gpu.func @update_nd_tdesc_2(%src: memref<8x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
// CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 0, 16] : !xegpu.tensor_desc<2x8x16xf32>
%2 = xegpu.update_nd_offset %1, [0, 0, 16]: !xegpu.tensor_desc<2x8x16xf32>
gpu.return
}
-// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @create_tdesc_1(%src: memref<?xf32, 3>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @create_tdesc_2(%src: memref<?xf32>) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-// CHECK: gpu.func @create_tdesc_4(%[[arg0:.*]]: ui64) {
-gpu.func @create_tdesc_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
-
-// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<4x2xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<4x2xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<4x8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<4x8xf16>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) {
-gpu.func @simt_load_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<8xf16>
- gpu.return
-}
// CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) {
gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) {
@@ -545,19 +419,6 @@ gpu.func @simt_load_7(%arg0: memref<256xf16>, %arg1: index, %arg2: i1) {
gpu.return
}
-// CHECK: gpu.func @subgroup_load_4(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_load_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<2x4xi1>
- %1 = arith.constant dense<1>: vector<2x4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<2x4xi1> -> vector<2x4x8xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<2x4xi1> -> vector<2x4x8xf16>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_load_offset_1(%arg0: memref<?xf16>) {
gpu.func @subgroup_load_offset_1(%src: memref<?xf16>) {
%offset = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -568,96 +429,6 @@ gpu.func @subgroup_load_offset_1(%src: memref<?xf16>) {
gpu.return
}
-// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32>
- %2 = arith.constant dense<2.9>: vector<4x2xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32>
- %2 = arith.constant dense<2.9>: vector<2xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4x2xf16>
- %2 = arith.constant dense<2.9>: vector<4x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store_2(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16>
- %2 = arith.constant dense<2.9>: vector<2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4xf32>
- %2 = arith.constant dense<2.9>: vector<4xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- gpu.return
-}
-
-// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) {
-gpu.func @simt_store_3(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
- %1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
- %2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
- gpu.return
-}
-
// CHECK: gpu.func @simt_store_4(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: vector<1xindex>, %[[arg3:.*]]: vector<1xi1>) {
gpu.func @simt_store_4(%arg0: vector<8xf16>, %arg1: memref<256xf16>, %arg2: vector<1xindex>, %arg3: vector<1xi1>) {
// CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
@@ -686,21 +457,6 @@ gpu.func @simt_store_7(%arg0: f16, %arg1: memref<256xf16>, %arg2: index, %arg3:
gpu.return
}
-// CHECK: gpu.func @subgroup_store_4(%[[arg0:.*]]: ui64) {
-gpu.func @subgroup_store_4(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex>
- %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex>
- //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<2x4xi1>
- %1 = arith.constant dense<1>: vector<2x4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2x4xf32>
- %2 = arith.constant dense<2.9>: vector<2x4xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_store_offset_1(%arg0: memref<?xf16>) {
gpu.func @subgroup_store_offset_1(%dest: memref<?xf16>) {
%val = arith.constant dense<2.9>: vector<4x2xf16>
@@ -712,17 +468,6 @@ gpu.func @subgroup_store_offset_1(%dest: memref<?xf16>) {
gpu.return
}
-// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
-gpu.func @prefetch(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- gpu.return
-}
-
// CHECK: gpu.func @prefetch_offset(%[[arg0:.*]]: ui64) {
gpu.func @prefetch_offset(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -732,19 +477,6 @@ gpu.func @prefetch_offset(%src: ui64) {
gpu.return
}
-// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) {
-gpu.func @create_update_tdesc(%src: ui64) {
- //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xindex>
- %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xindex>
- gpu.return
-}
-
// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
@@ -766,17 +498,6 @@ gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>)
gpu.return
}
-// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
- //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
- gpu.return
-}
-
// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
gpu.func @alloc_nbarrier() {
// CHECK: xegpu.alloc_nbarrier
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 9de2881d05d0b..9183cd99c6afd 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -6,11 +6,11 @@
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
-// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [8, 16]>}> :
+// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
-// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]][0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
// Although the uArch allows 8x32 inst data using block count (or array_len),
// it is up to optimization passes to decide on the block count usage.
@@ -18,9 +18,9 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
%1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
- xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x32xf32>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
- xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+ xegpu.prefetch_nd %0[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x32xf32>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+ xegpu.store_nd %2, %1[0, 0] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
return
}
}
@@ -30,28 +30,28 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf
// CHECK-LABEL: func.func @dpas_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
-// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout<inst_data = [16, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]][0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -66,23 +66,23 @@ gpu.module @test_kernel {
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
- %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
+ //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
//CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<16x32xf16>
%c = arith.addf %a, %b : vector<16x32xf16>
- //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
- xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+ //CHECK-COUNT: xegpu.store_nd {{.*}}[0, 0] : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
+ xegpu.store_nd %c, %arg2[0, 0]: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
//CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
@@ -105,23 +105,23 @@ gpu.module @test_kernel {
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
- %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
+ //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
//CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : vector<12x32xf16>
%c = arith.addf %a, %b : vector<12x32xf16>
- //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
- xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
+ //CHECK-COUNT: xegpu.store_nd {{.*}}[0, 0] : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
+ xegpu.store_nd %c, %arg2[0, 0]: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
//CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index f4859fe324b19..075254513975a 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -4,26 +4,26 @@ gpu.module @test {
// CHECK-LABEL: func.func @dpas_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -37,8 +37,8 @@ gpu.module @test {
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
- %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
- xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
+ %1 = xegpu.create_nd_tdesc %arg2 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
+ xegpu.store_nd %0, %1[0, 0] : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
return
}
}
@@ -47,18 +47,18 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}}[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1 <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[0, 0] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -71,14 +71,14 @@ gpu.module @test {
func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
%5 = xegpu.dpas %2, %4, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %5, %6[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -91,8 +91,8 @@ gpu.module @test {
// CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf32>
// CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf32> to vector<16x16xf16>
func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
- %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
%3 = arith.truncf %2 : vector<16x16xf32> to vector<16x16xf16>
%4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
@@ -101,84 +101,7 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
}
// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @load_gather_with_chunksize(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
-func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %2 = xegpu.create_tdesc %arg1, %cst : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
- %3 = xegpu.load %2, %cst_0 : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
- %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
- %5 = xegpu.dpas %1, %4 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- return
-}
-}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @load_gather_1d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
-// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
-func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.load %0, %cst_0 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
- return
-}
-}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @store_scatter_with_chunksize(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
-// CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store %{{.*}}, %[[T0]], %{{.*}} : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>, vector<16xi1>
-func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
- %cst = arith.constant dense<1.000000e+00> : vector<16x8xf32>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %0 = xegpu.create_tdesc %arg0, %cst_1 : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
- xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
- return
-}
-}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @store_scatter_1d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1>
-func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
- %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
- %cst_0 = arith.constant dense<true> : vector<16xi1>
- %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
- return
-}
-}
-// -----
gpu.module @test {
// CHECK-LABEL: func.func @scatter_ops_chunksize(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
@@ -257,9 +180,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}>
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -267,69 +190,69 @@ gpu.module @test {
// CHECK-SAME: vector<16x16xi16> to vector<16x16xf16>
func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
+ %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16>
%4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x16xf16>
%5 = vector.bitcast %3 : vector<16x16xi16> to vector<16x16xf16>
%6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %6, %7[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[0, 0] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32>
%4 = vector.bitcast %3 : vector<16x8xi32> to vector<16x16xf16>
%5 = vector.transpose %4, [1, 0] : vector<16x16xf16> to vector<16x16xf16>
%6 = xegpu.dpas %2, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %6, %7[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16>
%3 = vector.bitcast %2 : vector<8x32xi16> to vector<8x16xi32>
- xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
+ xegpu.store_nd %3, %1[0, 0] : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[0, 0] : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi32> to vector<8x32xi16>
func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) {
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16>
+ %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
%3 = vector.bitcast %2 : vector<8x16xi32> to vector<8x32xi16>
- xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16>
+ xegpu.store_nd %3, %1[0, 0] : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16>
return
}
}
@@ -340,18 +263,18 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
- %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%3 = arith.addf %1, %2 : vector<16x16xf16>
%4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %arg2[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -364,16 +287,16 @@ gpu.module @test {
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
- %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
%2 = arith.addf %1, %cst : vector<16x16xf16>
%3 = xegpu.dpas %0, %2 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- xegpu.store_nd %3, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %3, %arg2[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %2, %arg3[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -381,14 +304,14 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !
gpu.module @test {
// CHECK-LABEL: func.func @for_op(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
@@ -397,25 +320,25 @@ gpu.module @test {
// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c16 = arith.constant 16 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
%2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) {
- %4 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %5 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = xegpu.load_nd %arg4[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %5 = xegpu.load_nd %arg5[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
%7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16>
%8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16>
scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>
}
- %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %3 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %2#2, %3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -426,25 +349,25 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
- %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
- %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
scf.yield %3 : vector<16x16xf16>
} else {
- %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
scf.yield %3 : vector<16x16xf16>
}
%2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %2, %arg3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -456,26 +379,26 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
- %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
- %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
scf.yield %3 : vector<16x16xf16>
} else {
- %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
scf.yield %3 : vector<16x16xf16>
}
%2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %2, %arg3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %1, %arg4[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -487,7 +410,7 @@ gpu.module @test {
func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
- xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %0, %arg1[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
}
@@ -499,7 +422,7 @@ gpu.module @test {
func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
- xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %0, %arg1[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
}
@@ -507,15 +430,15 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
gpu.module @test {
// CHECK-LABEL: func.func @update_nd_offset_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%1 = arith.constant dense<1.000000e+00> : vector<16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
%2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
- xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
return
}
}
@@ -523,15 +446,15 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
gpu.module @test {
// CHECK-LABEL: func.func @update_nd_offset_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
%c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
%1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
%2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
- xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
+ xegpu.store_nd %1, %2[0, 0] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
return
}
}
@@ -539,12 +462,12 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]][0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @prefetch_2d(%arg0: memref<256x256xf16>){
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
- xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.prefetch_nd %0[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -552,12 +475,12 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){
gpu.module @test {
// CHECK-LABEL: func.func @prefetch_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]][0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
func.func @prefetch_1d(%arg0: memref<256xf16>){
%c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
- xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
+ xegpu.prefetch_nd %0[0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
return
}
}
@@ -576,9 +499,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
%c0 = arith.constant 0 : i32
%c16 = arith.constant 16 : i32
%c256 = arith.constant 256 : i32
- %0 = xegpu.create_nd_tdesc %arg0[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.create_nd_tdesc %arg1[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.load_nd %0[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
%3:3 = scf.while (%arg2 = %1, %arg3 = %c0, %arg4 = %0) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>)
-> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) {
@@ -586,10 +509,10 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
scf.condition(%4) %arg2, %arg3, %arg4 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
} do {
^bb0(%arg2: vector<16xf32>, %arg3: i32, %arg4: !xegpu.tensor_desc<16xf32>):
- xegpu.store_nd %arg2, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %arg2, %2[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
%4 = arith.addi %arg3, %c16 : i32
%5 = xegpu.update_nd_offset %arg4, [16] : !xegpu.tensor_desc<16xf32>
- %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ %6 = xegpu.load_nd %5[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
scf.yield %6, %4, %5 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
}
return
@@ -600,7 +523,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -609,11 +532,11 @@ gpu.module @test {
func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.0000> : vector<16xf16>
- %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = vector.multi_reduction <add>, %3, %cst [0] : vector<16x16xf16> to vector<16xf16>
%2 = vector.shape_cast %4 : vector<16xf16> to vector<1x16xf16>
%5 = vector.broadcast %2 : vector<1x16xf16> to vector<16x16xf16>
- xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -622,7 +545,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
@@ -632,11 +555,11 @@ gpu.module @test {
func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.0000> : vector<16xf16>
- %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
%2 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
%5 = vector.broadcast %2 : vector<16x1xf16> to vector<16x16xf16>
- xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -645,7 +568,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -654,10 +577,10 @@ gpu.module @test {
func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.0000> : vector<16xf16>
- %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = vector.multi_reduction <add>, %3, %cst [0] : vector<16x16xf16> to vector<16xf16>
%5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16>
- xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -675,11 +598,11 @@ gpu.module @test {
func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.0000> : vector<16xf16>
- %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
%5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
%6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16>
- xegpu.store_nd %6, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %6, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
@@ -694,7 +617,7 @@ gpu.module @test {
func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) {
%cst = arith.constant 0.0000 : f16
%6 = vector.broadcast %cst : f16 to vector<16x16xf16>
- xegpu.store_nd %6, %arg0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %6, %arg0[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 0b6e30e6f95f0..e6f6eb976577b 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -14,18 +14,18 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c16 : index
%n = arith.muli %block_id_y, %c32 : index
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+ %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
@@ -36,7 +36,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -55,18 +55,18 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c16 : index
%n = arith.muli %block_id_y, %c32 : index
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
- %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
+ %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
@@ -77,7 +77,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
gpu.return
}
}
@@ -97,20 +97,20 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c8 : index
%n = arith.muli %block_id_y, %c32 : index
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
- %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
%out:3 = scf.for %k = %c0 to %c1024 step %c16
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) {
//CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0[0, 0] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
//CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
@@ -120,7 +120,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
}
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
gpu.return
}
}
@@ -140,18 +140,18 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c16 : index
%n = arith.muli %block_id_y, %c32 : index
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+ %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
%e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
@@ -164,7 +164,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -180,22 +180,22 @@ gpu.module @test_kernel {
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
- %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
+ xegpu.store_nd %c, %arg2[0, 0] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
@@ -219,22 +219,22 @@ gpu.module @test_kernel {
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
- %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
- %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
- %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
- %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
- %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %a = xegpu.load_nd %arg0[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %b = xegpu.load_nd %arg1[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
- xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
+ xegpu.store_nd %c, %arg2[0] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l>
@@ -256,13 +256,13 @@ gpu.module @test_kernel {
%c64 = arith.constant 64 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
- %0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
+ %0 = xegpu.create_nd_tdesc %a : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
+ %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32>
- %3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
+ %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
+ xegpu.store_nd %2, %3[0] {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
gpu.return
}
}
@@ -281,15 +281,15 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c32 : index
%n = arith.muli %block_id_y, %c32 : index
- %0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l>
- %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
+ %0 = xegpu.create_nd_tdesc %a : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l>
+ %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32>
- %3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
+ %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
+ xegpu.store_nd %2, %3[0] {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
gpu.return
}
}
@@ -303,12 +303,12 @@ gpu.module @test_kernel {
%c64 = arith.constant 64 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
- %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
+ %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
+ %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
// CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
%2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
- %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
+ %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
+ xegpu.store_nd %2, %3[0, 0] {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
gpu.return
}
}
@@ -322,13 +322,13 @@ gpu.module @test_kernel {
%c32 = arith.constant 32 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
- %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
+ %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
+ %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
%11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
// CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
%2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
- %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
- xegpu.store_nd %2, %3: vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
+ %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
+ xegpu.store_nd %2, %3[0, 0]: vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
gpu.return
}
}
@@ -342,12 +342,12 @@ gpu.module @test_kernel {
%c32 = arith.constant 32 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
- %0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
- %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
+ %0 = xegpu.create_nd_tdesc %a : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
+ %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
// CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32>
%2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
- %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
- xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
+ %3 = xegpu.create_nd_tdesc %b : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
+ xegpu.store_nd %2, %3[0, 0] {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
gpu.return
}
}
@@ -407,154 +407,6 @@ gpu.module @test_kernel {
// -----
-gpu.module @test_kernel {
- // CHECK-LABEL: test_prefetch_load_store_update
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}}
- // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
- // CHECK-COUNT-2: xegpu.load {{.*}}
- // CHECK-COUNT-2: xegpu.store {{.*}}
-
- gpu.func @test_prefetch_load_store_update(%src: ui64) {
-
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-
- %delta = arith.constant dense<[
- 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 64,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 256
- ]> : vector<32xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
- xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}:
- vector<32xf32>,
- !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>,
- vector<32xi1>
-
- gpu.return
- }
-
-}
-
-// -----
-gpu.module @test_kernel {
- // CHECK-LABEL: test_prefetch_load_store_update_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
-
- gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) {
-
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16, 2]>}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- %delta = arith.constant dense<[
- 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 64,
- 128, 128, 128, 128, 128, 128, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 256
- ]> : vector<32xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>:
- vector<32x4xf32>,
- !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>,
- vector<32xi1>
-
- gpu.return
- }
-}
-
-// -----
-#l = #xegpu.layout<inst_data = [2, 8, 2]>
-
-// test the blocking pass on a 3D scattered tensor descriptor,
-// Ops working 4x8x4xf32 scattered tensor_descs will be unrolled
-// into 4 ops working 2x8x2xf32 scattered tensor_descs based on
-// the given layout.
-gpu.module @test_kernel {
- // CHECK-LABEL: test_3d_scattered_tensor_desc
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: [[cst_1:%.+]] = arith.constant dense<{{.*}}[130, 138, 146, 154, 162, 170, 178, 186], [194, 202, 210, 218, 226, 234, 242, 250]]> : vector<2x8xindex>
- // CHECK: [[cst_2:%.+]] = arith.constant dense<{{.*}}[2, 10, 18, 26, 34, 42, 50, 58], [66, 74, 82, 90, 98, 106, 114, 122]]> : vector<2x8xindex>
- // CHECK: [[cst_3:%.+]] = arith.constant dense<{{.*}}[0, 8, 16, 24, 32, 40, 48, 56], [64, 72, 80, 88, 96, 104, 112, 120]]> : vector<2x8xindex>
- // CHECK: [[cst_4:%.+]] = arith.constant dense<{{.*}}[128, 136, 144, 152, 160, 168, 176, 184], [192, 200, 208, 216, 224, 232, 240, 248]]> : vector<2x8xindex>
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<2x8xindex> -> !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xi1> -> vector<2x8x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} : vector<2x8x2xf32>, !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<2x8xi1>
-
-
- gpu.func @test_3d_scattered_tensor_desc(%src: ui64) {
- %cst = arith.constant dense<[
- [0, 8, 16, 24, 32, 40, 48, 56],
- [64, 72, 80, 88, 96, 104, 112, 120],
- [128, 136, 144, 152, 160, 168, 176, 184],
- [192, 200, 208, 216, 224, 232, 240, 248]
- ]> : vector<4x8xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
- xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
-
- %delta = arith.constant dense<[
- [32, 32, 32, 32, 32, 32, 32, 32],
- [32, 32, 32, 32, 32, 32, 32, 64],
- [128, 128, 128, 128, 128, 128, 128, 128],
- [128, 128, 128, 128, 128, 128, 128, 256]
- ]> : vector<4x8xindex>
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xindex>
-
- %c4 = arith.constant 4: index
- %mask = vector.create_mask %c4, %c4: vector<4x8xi1>
-
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
-
- %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>:
- vector<4x8x4xf32>,
- !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>,
- vector<4x8xi1>
- gpu.return
- }
-}
-
-// -----
#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
#c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
@@ -564,30 +416,29 @@ gpu.module @test_kernel {
//CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32>
//CHECK: [[c8:%.+]] = arith.constant 8 : index
//CHECK: [[c0:%.+]] = arith.constant 0 : index
- //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
- //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]][[[c0]], [[c0]]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]][[[c0]], [[c0]]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]]
//CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]]
- //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: [[c_tdesc:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc]][[[c0]], [[c0]]] {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc]][[[c8]], [[c0]]] {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) {
%c0 = arith.constant 0 : index
- %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
- %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
- %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
+ %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %b = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
%c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
- %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
- xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
+ xegpu.store_nd %c, %c_tdesc[0, 0] {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
index d32954127fce6..c25b41427437d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
@@ -1,20 +1,20 @@
// RUN: mlir-opt -xegpu-fold-alias-ops -split-input-file %s | FileCheck %s
-func.func @fold_subview_with_xegpu_create_nd_tdesc(%arg0 : memref<256x256xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) ->(!xegpu.tensor_desc<8x16xf32>) {
+func.func @fold_subview_with_xegpu_create_nd_tdesc(%arg0 : memref<256x256xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> vector<8x16xf32> {
%subview = memref.subview %arg0[%arg1, %arg2] [32, 32] [1, 1] :
memref<256x256xf32> to memref<32x32xf32, strided<[256, 1], offset: ?>>
- %0 = xegpu.create_nd_tdesc %subview[%arg3, %arg4] :
+ %0 = xegpu.create_nd_tdesc %subview :
memref<32x32xf32, strided<[256, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32>
- return %0 : !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.load_nd %0[%arg3, %arg4] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
+ return %1 : vector<8x16xf32>
}
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
// CHECK: func @fold_subview_with_xegpu_create_nd_tdesc
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<256x256xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index
-// CHECK-DAG: %[[IDX0:.+]] = affine.apply #[[MAP]]()[%[[ARG1]], %[[ARG3]]]
-// CHECK-DAG: %[[IDX1:.+]] = affine.apply #[[MAP]]()[%[[ARG2]], %[[ARG4]]]
-// CHECK: xegpu.create_nd_tdesc %[[ARG0]][%[[IDX0]], %[[IDX1]]] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG0]][%[[ARG1]], %[[ARG2]]] [32, 32] [1, 1] : memref<256x256xf32> to memref<32x32xf32, strided<[256, 1], offset: ?>>
+// CHECK: %[[TDESC:.+]] = xegpu.create_nd_tdesc %[[SUBVIEW]] : memref<32x32xf32, strided<[256, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[LOAD:.+]] = xegpu.load_nd %[[TDESC]][%[[ARG3]], %[[ARG4]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir
index 6eee5a544e3f8..2c9c7cc63c590 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir
@@ -52,9 +52,8 @@ gpu.module @xevm_test {
//-----
// CHECK-LABEL: load_nd_offsets_at_both_places
- // CHECK-COUNT-2: builtin.unrealized_conversion_cast
gpu.func @load_nd_offsets_at_both_places(%src: memref<256x318xf32>) -> vector<24x32xf32> {
- %tdesc = xegpu.create_nd_tdesc %src[16, 8] : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
%ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
gpu.return %ld : vector<24x32xf32>
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index c3be138fef38a..667ec8c7ecc30 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -4,14 +4,12 @@ gpu.module @test {
// CHECK-LABEL: create_nd_tdesc
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>,
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>,
- // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>
// CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {__xegpu_blocking_tile_shape__ = array<i64: 8, 16>, __xegpu_blocking_unpack__}
gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
}
@@ -19,12 +17,12 @@ gpu.module @test {
// CHECK-LABEL: create_nd_tdesc_1d
// CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
// CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+ // CHECK-SAME: !xegpu.tensor_desc<16xf32>
// CHECK-SAME: to !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {__xegpu_blocking_tile_shape__ = array<i64: 16>, __xegpu_blocking_unpack__}
gpu.func @create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
- %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
}
@@ -32,10 +30,10 @@ gpu.module @test {
// CHECK-LABEL: update_nd_tdesc
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32>
gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
%update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
}
@@ -44,10 +42,10 @@ gpu.module @test {
// CHECK-LABEL: update_nd_tdesc_1d
// CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
// CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32>
gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
- %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
%update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
}
@@ -56,11 +54,11 @@ gpu.module @test {
// CHECK-LABEL: prefetch_nd_tdesc
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-COUNT-6: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<8x16xf32>
gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
- xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ xegpu.prefetch_nd %tdesc[0, 0] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.return
}
@@ -68,23 +66,23 @@ gpu.module @test {
// CHECK-LABEL: prefetch_nd_tdesc_1d
// CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
// CHECK-COUNT-4: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<16xf32>
gpu.func @prefetch_nd_tdesc_1d(%src: memref<64xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
- xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
+ xegpu.prefetch_nd %tdesc[0] : !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
gpu.return
}
//-----
// CHECK-LABEL: load_nd
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
// CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32>
gpu.func @load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
- %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %ld = xegpu.load_nd %tdesc[0, 0]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
gpu.return %ld : vector<24x32xf32>
}
@@ -92,12 +90,12 @@ gpu.module @test {
// CHECK-LABEL: load_nd_1d
// CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
// CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
// CHECK-COUNT-4: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<16xf32> into vector<64xf32>
gpu.func @load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> {
- %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
- %data = xegpu.load_nd %tdesc: !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>> -> vector<64xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
+ %data = xegpu.load_nd %tdesc[0]: !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>> -> vector<64xf32>
gpu.return %data : vector<64xf32>
}
@@ -105,12 +103,12 @@ gpu.module @test {
// CHECK-LABEL: store_nd
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
// CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.func @store_nd(%src: memref<24x32xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
%data = arith.constant dense<9.0> : vector<24x32xf32>
- xegpu.store_nd %data, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ xegpu.store_nd %data, %tdesc[0, 0]: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.return
}
@@ -118,12 +116,12 @@ gpu.module @test {
// CHECK-LABEL: store_nd_1d
// CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
+ // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
// CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32>
gpu.func @store_nd_1d(%src: memref<64xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
%data = arith.constant dense<9.0> : vector<64xf32>
- xegpu.store_nd %data, %tdesc: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
+ xegpu.store_nd %data, %tdesc[0]: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
gpu.return
}
@@ -131,18 +129,18 @@ gpu.module @test {
// CHECK-LABEL: createNd_loadNd_storeNd
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- //CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ //CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
//CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
//CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32>
//CHECK: [[add:%.+]] = arith.addf {{.*}} : vector<24x32xf32>
//CHECK-COUNT-6: [[extract:%.+]] = vector.extract_strided_slice {{.*}} : vector<24x32xf32> to vector<8x16xf32>
//CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.func @createNd_loadNd_storeNd(%src: memref<24x32xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
%data = arith.constant dense<9.0> : vector<24x32xf32>
- %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
+ %ld = xegpu.load_nd %tdesc[0, 0]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
%add = arith.addf %data, %ld : vector<24x32xf32>
- xegpu.store_nd %add, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+ xegpu.store_nd %add, %tdesc[0, 0]: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.return
}
@@ -158,318 +156,5 @@ gpu.module @test {
%c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32>
gpu.return %c : vector<32x32xf32>
}
-
-//-----
-
- // CHECK-LABEL: create_tdesc_vec
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- }
-
-//-----
-
- // CHECK-LABEL: create_tdesc_step
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- }
-
-//-----
-
- // CHECK-LABEL: load
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- gpu.func @load(%src: ui64) -> vector<32xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
-
- gpu.return %ld : vector<32xf32>
- }
-
-//-----
-
-
- // CHECK-LABEL: load_with_offsets
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
- gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
-
- gpu.return %ld : vector<32xf32>
- }
-
-//-----
-
- // CHECK-LABEL: prefetch
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- gpu.func @prefetch(%src: ui64) {
-
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- gpu.return
- }
-
-//-----
-
- // CHECK-LABEL: store
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
- gpu.func @store(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.0>: vector<32xf32>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1>
-
- gpu.return
- }
-
- //-----
-
- // CHECK-LABEL: store_with_offsets
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
- gpu.func @store_with_offsets(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.0>: vector<32xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
-
- gpu.return
- }
-
-//-----
- // CHECK-LABEL: create_tdesc_step_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4 : i64>>
- gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
- }
-
-//-----
- // CHECK-LABEL: create_tdesc_step_chunk2
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
- %step = arith.constant dense<8> : vector<32xindex>
- %seq = vector.step : vector<32xindex>
- %cst = arith.muli %seq, %step : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- }
-
-// CHECK-LABEL: create_tdesc_step_chunk3
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
- // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>> {
- %step = arith.constant dense<8> : vector<16xindex>
- %seq = vector.step : vector<16xindex>
- %cst = arith.muli %seq, %step : vector<16xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
- gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
- }
-
-//-----
- // CHECK-LABEL: load_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
-
- gpu.func @load_chunk(%src: ui64) -> vector<32x4xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
-
- gpu.return %ld : vector<32x4xf32>
- }
-
-//-----
- // CHECK-LABEL: load_with_offsets_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
- // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
- // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
- // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
- // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
- gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
- gpu.return %ld : vector<32x4xf32>
- }
-
-//-----
- // CHECK-LABEL: store_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
- gpu.func @store_chunk(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16,2]>>, vector<32xi1>
-
- gpu.return
- }
-
-//-----
- // CHECK-LABEL: store_with_offsets_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32
- // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
- // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
- // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
- // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
- // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
- gpu.func @store_with_offsets_chunk(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
-
- %c17 = arith.constant 17: index
- %mask = vector.create_mask %c17: vector<32xi1>
-
- %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
- gpu.return
- }
-
-//-----
- // CHECK-LABEL: prefetch_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- gpu.func @prefetch_chunk(%src: ui64) {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- gpu.return
- }
-
-//-----
- // CHECK-LABEL: update_chunk
- // CHECK-SAME: [[arg0:%.+]]: ui64
- // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
- %cst = arith.constant dense<[
- 0, 8, 16, 24, 32, 40, 48, 56,
- 64, 72, 80, 88, 96, 104, 112, 120,
- 128, 136, 144, 152, 160, 168, 176, 184,
- 192, 200, 208, 216, 224, 232, 240, 248
- ]> : vector<32xindex>
- %delta = arith.constant dense<32>: vector<32xindex>
- %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
-
- %new_tdesc = xegpu.update_offset %tdesc, %delta
- : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
-
- gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- }
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
index 0bb7d7d3d8b1b..f0bfddbf3ebff 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir
@@ -210,18 +210,17 @@ func.func @gather_memref_2d(%base: memref<?x?xf32>, %v: vector<2x3xindex>, %mask
// The `xegpu-vector-linearize` pass does not itself affect the XeGPU ops.
// CHECK: gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel {
-// CHECK: %c0 = arith.constant 0 : index
// CHECK: %cst = arith.constant dense<0.000000e+00> : vector<64xf16>
// CHECK: %cst_0 = arith.constant dense<5.000000e+00> : vector<64xf32>
-// CHECK: %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0]
-// CHECK: %1 = xegpu.load_nd %0
+// CHECK: %0 = xegpu.create_nd_tdesc %arg0
+// CHECK: %1 = xegpu.load_nd %0[0, 0]
// CHECK: %2 = vector.shape_cast %1 : vector<8x16xf16> to vector<128xf16>
// CHECK: %3 = vector.shuffle %2, %cst {{.*}} : vector<128xf16>, vector<64xf16>
// CHECK: %4 = vector.shape_cast %3 : vector<128xf16> to vector<8x16xf16>
-// CHECK: %5 = xegpu.create_nd_tdesc %arg1[%c0, %c0]
-// CHECK: %6 = xegpu.load_nd %5
+// CHECK: %5 = xegpu.create_nd_tdesc %arg1
+// CHECK: %6 = xegpu.load_nd %5[0, 0]
// CHECK: %7 = vector.shape_cast %6 : vector<16x16xf16> to vector<256xf16>
// CHECK: %8 = vector.shuffle %7, %cst {{.*}} : vector<256xf16>, vector<64xf16>
// CHECK: %9 = vector.shape_cast %8 : vector<256xf16> to vector<16x16xf16>
@@ -233,8 +232,8 @@ func.func @gather_memref_2d(%base: memref<?x?xf32>, %v: vector<2x3xindex>, %mask
// CHECK: %14 = vector.shuffle %11, %13 {{.*}} : vector<128xf32>, vector<64xf32>
// CHECK: %15 = vector.shape_cast %14 : vector<128xf32> to vector<8x16xf32>
-// CHECK: %16 = xegpu.create_nd_tdesc %arg2[%c0, %c0]
-// CHECK: xegpu.store_nd %15, %16
+// CHECK: %16 = xegpu.create_nd_tdesc %arg2
+// CHECK: xegpu.store_nd %15, %16[0, 0]
// CHECK: gpu.return
gpu.module @test_kernel {
@@ -243,19 +242,19 @@ gpu.module @test_kernel {
%cst_vec_0 = arith.constant dense<0.000000e+00> : vector<8x8xf16>
%cst_vec_1 = arith.constant dense<0.000000e+00> : vector<8x8xf16>
%cst_vec_2 = arith.constant dense<5.000000e+00> : vector<8x8xf32>
- %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<array_length = 1>>
- %a_val = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<array_length = 1>> -> vector<8x16xf16>
+ %a_tdesc = xegpu.create_nd_tdesc %A : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<array_length = 1>>
+ %a_val = xegpu.load_nd %a_tdesc[0, 0] : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<array_length = 1>> -> vector<8x16xf16>
%a_val_0 = vector.insert_strided_slice %cst_vec_0, %a_val{offsets = [0, 0], sizes = [8, 8], strides = [1, 1]}: vector<8x8xf16> into vector<8x16xf16>
- %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 1>>
+ %b_tdesc = xegpu.create_nd_tdesc %B : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 1>>
- %b_val = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 1>> -> vector<16x16xf16>
+ %b_val = xegpu.load_nd %b_tdesc[0, 0] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 1>> -> vector<16x16xf16>
%b_val_0 = vector.insert_strided_slice %cst_vec_1, %b_val{offsets = [0, 0], sizes = [8, 8], strides = [1, 1]}: vector<8x8xf16> into vector<16x16xf16>
%c_val = xegpu.dpas %a_val_0, %b_val_0 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%c_val_0 = vector.extract_strided_slice %c_val {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<8x16xf32> to vector<8x8xf32>
%c_addf = arith.addf %c_val_0, %cst_vec_2 : vector<8x8xf32>
%c_result = vector.insert_strided_slice %c_addf, %c_val {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<8x8xf32> into vector<8x16xf32>
- %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<array_length = 1>>
- xegpu.store_nd %c_result, %c_tdesc : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ %c_tdesc = xegpu.create_nd_tdesc %C : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<array_length = 1>>
+ xegpu.store_nd %c_result, %c_tdesc[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 6e9711442b92d..a7445982fe85e 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -4,9 +4,9 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: unary_ops_sg_layout_only
gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} : vector<12x8xf32>
@@ -22,9 +22,9 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: unary_ops
gpu.func @unary_ops(%a: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
@@ -40,14 +40,14 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: binary_ops
gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -65,19 +65,19 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: ternary_ops
gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1>
+ %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1>
-> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi1>
// CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -95,14 +95,14 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: type_conversion_ops
gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -120,24 +120,24 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: comparison_ops
gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi32>
+ %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32>
+ %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+ %load_d = xegpu.load_nd %tdesc_d[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -156,14 +156,14 @@ gpu.module @test_elementwise_ops {
// 1 to N decomposition of elementwise operations
// CHECK-LABEL: elementwise_ops_rr_assignment
gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
deleted file mode 100644
index 6b8b4f282b744..0000000000000
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ /dev/null
@@ -1,231 +0,0 @@
-// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
-
-gpu.module @test_round_robin_assignment {
- // CHECK-LABEL: create_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-NOT: xegpu.create_nd_tdesc
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: create_nd_tdesc_with_shared_data
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) {
- // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
- // CHECK: %[[C4:.*]] = arith.constant 4 : index
- // CHECK: %[[IDX:.*]] = arith.remui %[[SGID]], %[[C4]]
- // CHECK: %[[IDY_DIV:.*]] = arith.divui %[[SGID]], %[[C4]]
- // CHECK: %[[C8:.*]] = arith.constant 8 : index
- // CHECK: %[[IDY:.*]] = arith.remui %[[IDY_DIV]], %[[C8]]
- // CHECK: %[[C16:.*]] = arith.constant 16 : index
- // CHECK: %[[LY:.*]] = arith.muli %[[IDY]], %[[C16]]
- // CHECK: %[[C64:.*]] = arith.constant 64 : index
- // CHECK: %[[LX:.*]] = arith.muli %[[IDX]], %[[C64]]
- // CHECK: %[[C128:.*]] = arith.constant 128 : index
- // CHECK: %[[OFFY:.*]] = arith.remui %[[LY]], %[[C128]]
- // CHECK: %[[C64_1:.*]] = arith.constant 64 : index
- // CHECK: %[[OFFX:.*]] = arith.remui %[[LX]], %[[C64_1]]
- // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
- gpu.return
- }
-
- // CHECK-LABEL: load_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
- // CHECK-NOT: xegpu.load_nd
- %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- gpu.return
- }
-
- // CHECK-LABEL: store_nd
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @store_nd(%src: memref<256x128xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-NOT: xegpu.store_nd
- %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: update_nd
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @update_nd(%src: memref<256x128xf32>){
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-NOT: xegpu.update_nd_offset
- %update = xegpu.update_nd_offset %tdesc, [0, 16]
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: dpas
- // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
- gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
- // CHECK-NOT: xegpu.dpas
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
- -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<256x128xf16>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16>
- -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- -> vector<128x256xf16>
- %dpas = xegpu.dpas %load_a, %load_b
- {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
- gpu.return
- }
-
- // CHECK-LABEL: prefetch_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-NOT: xegpu.prefetch_nd
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: broadcast
- // CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32>
- gpu.func @broadcast(%src: memref<128x1xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32>
- -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- -> vector<128x1xf32>
- // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
- // CHECK-NOT: vector.broadcast
- %broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
- : vector<128x1xf32> to vector<128x64xf32>
- gpu.return
- }
-
- gpu.func @scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c1 = arith.constant 1 : index
- %c10 = arith.constant 10 : index
- %c0 = arith.constant 0 : index
- %c256 = arith.constant 256 : index
- %c1024 = arith.constant 1024 : index
- %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- // CHECK-LABEL: scf.for
- // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
- %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1)
- -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
- %3 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
- scf.yield %4, %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- }
- gpu.return
- }
-
- gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c1_i32 = arith.constant 1 : i32
- %c10_i32 = arith.constant 10 : i32
- %c0_i32 = arith.constant 0 : i32
- %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
- %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
- %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
- // CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32
- scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
- } do {
- // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32)
- ^bb0(%arg2: vector<256xf32>, %arg3: i32):
- xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %4 = arith.addi %arg3, %c1_i32 : i32
- %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- scf.yield %6, %4 : vector<256xf32>, i32
- }
- gpu.return
- }
-
- gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c10 = arith.constant 10 : index
- %0 = gpu.subgroup_id : index
- %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %3 = arith.cmpi eq, %0, %c10 : index
- // CHECK-LABEL: scf.if
- // CHECK-SAME: (vector<16xf32>, vector<16xf32>)
- %4 = scf.if %3 -> (vector<256xf32>) {
- %5 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: vector<16xf32>, vector<16xf32>
- scf.yield %5 : vector<256xf32>
- } else {
- %5 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: vector<16xf32>, vector<16xf32>
- scf.yield %5 : vector<256xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>}
- xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- gpu.return
- }
-
- gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c10 = arith.constant 10 : index
- %id = gpu.subgroup_id : index
-
- %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %d = xegpu.load_nd %t {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}: !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
-
- %0 = arith.cmpi eq, %id, %c10 : index
- // CHECK-LABEL: scf.if
- // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
- %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
- %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
- scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- } else {
- %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
- scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- }
- xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- gpu.return
- }
-
- gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
- %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
- // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
- // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
- %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
- %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
- target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
- gpu.return
- }
-}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 1fc2328d09046..ac84c106d749c 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -702,8 +702,8 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MUL2:.*]] = arith.muli %[[REM1]], %[[C32:.*]] : index
// CHECK-DAG: %[[REM3:.*]] = arith.remui %[[MUL1]], %[[C256:.*]] : index
// CHECK-DAG: %[[REM4:.*]] = arith.remui %[[MUL2]], %[[C128:.*]] : index
- // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[REM3]], %[[REM4]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32>
- // CHECK-DAG: %[[LOAD_ND:.*]] = xegpu.load_nd %[[TDESC]] : !xegpu.tensor_desc<32x32xf32> -> vector<32x32xf32>
+ // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32>
+ // CHECK-DAG: %[[LOAD_ND:.*]] = xegpu.load_nd %[[TDESC]][%[[REM3]], %[[REM4]]] : !xegpu.tensor_desc<32x32xf32> -> vector<32x32xf32>
// CHECK-DAG: %[[CST_LOCAL:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
// CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_ND]], %[[CST_LOCAL]] [0] : vector<32x32xf32> to vector<32xf32>
// CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<32xf32> to vector<1x32xf32>
@@ -724,9 +724,9 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32>
// CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST:.*]] : vector<32xf32>
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} dense<0.0> : vector<128xf32>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc[0, 0]
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
-> vector<256x128xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
deleted file mode 100644
index 4f29a686d301f..0000000000000
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ /dev/null
@@ -1,387 +0,0 @@
-// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
-
-gpu.module @test_1_1_assignment {
- // CHECK-LABEL: create_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
- // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]]
- // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]]
- // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]]
- // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]]
- // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]]
- // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]]
- // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]]
- // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref
- // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32>
- gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) {
- // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
- // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]]
- // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]]
- // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]]
- // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]]
- // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]]
- // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]]
- // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]]
- // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: load_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
- // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-SAME: -> vector<32x32xf32>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- gpu.return
- }
-
- // CHECK-LABEL: store_nd
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @store_nd(%src: memref<256x128xf32>) {
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
- // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-SAME: -> vector<32x32xf32>
- // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]]
- // CHECK-SAME: : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- xegpu.store_nd %load, %tdesc
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: update_nd
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @update_nd(%src: memref<256x128xf32>){
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
- // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %update = xegpu.update_nd_offset %tdesc, [0, 16]
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: dpas
- gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<128x128xf16>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- -> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
- %dpas = xegpu.dpas %load_a, %load_b
- {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
- gpu.return
- }
-
- // CHECK-LABEL: dpas_no_sg_data
- gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
- order = [1, 0]>>
- %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
- order = [1, 0]>>
- -> vector<128x128xf16>
- %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
- order = [1, 0]>>
- %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
- order = [1, 0]>>
- -> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
- %dpas = xegpu.dpas %load_a, %load_b
- {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>,
- layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
- : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
- gpu.return
- }
-
- // CHECK-LABEL: prefetch_nd_tdesc
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
- gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) {
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32>
- // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[TDESC]]
- // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-
- // CHECK-LABEL: dpas_with_no_create_nd_desc
- gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
- // CHECK-NOT: vector<32x32xf32>
- %dpas = xegpu.dpas %a, %b
- {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], inst_data=[8, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
- gpu.return
- }
-
- // CHECK-LABEL: broadcast_dim1
- // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32>
- gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x1xf32>
- -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- -> vector<256x1xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32>
- %broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
- : vector<256x1xf32> to vector<256x32xf32>
- gpu.return
- }
-
- // CHECK-LABEL: broadcast_dim0
- // CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32>
- gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<1x128xf32>
- -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<1x128xf32>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- // CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32>
- %broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<1x128xf32> to vector<32x128xf32>
- gpu.return
- }
-
- gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
- // CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %block_id_x = gpu.block_id x
- %block_id_y = gpu.block_id y
- %0 = arith.muli %block_id_x, %c128 : index
- %1 = arith.muli %block_id_y, %c128 : index
- %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- %3 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
- %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
- %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
-
- // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
- // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) ->
- // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
- // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
- // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
- // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
- // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16>
- // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16>
- // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
- %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3)
- -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
- %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
- %10 = xegpu.dpas %8, %9, %arg6
- {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
- layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
- : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
- %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
- %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
- scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
- !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>
- }
- %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- xegpu.store_nd %6#2, %7 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- gpu.return
- }
-
- gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c1_i32 = arith.constant 1 : i32
- %c10_i32 = arith.constant 10 : i32
- %c0_i32 = arith.constant 0 : i32
- %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
- %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
-
- // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
- %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
- %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
- // CHECK: scf.condition{{.*}} : vector<16xf32>, i32
- scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
- } do {
- // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32)
- ^bb0(%arg2: vector<256xf32>, %arg3: i32):
- xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %4 = arith.addi %arg3, %c1_i32 : i32
- %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
- scf.yield %6, %4 : vector<256xf32>, i32
- }
- gpu.return
- }
-
- gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c10 = arith.constant 10 : index
- %id = gpu.subgroup_id : index
-
- %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
-
- %4 = arith.cmpi eq, %id, %c10 : index
- // CHECK-LABEL: scf.if
- // CHECK-SAME: (vector<16xf32>)
- %5 = scf.if %4 -> (vector<256xf32>) {
- // CHECK-LABEL: xegpu.load_nd
- // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: vector<16xf32>
- scf.yield %2 : vector<256xf32>
- } else {
- // CHECK-LABEL: xegpu.load_nd
- // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %3 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: vector<16xf32>
- scf.yield %3 : vector<256xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [16]>}
- xegpu.store_nd %5, %0 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- gpu.return
- }
-
- gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
- %c10 = arith.constant 10 : index
- %id = gpu.subgroup_id : index
-
- %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
-
- %0 = arith.cmpi eq, %id, %c10 : index
- // CHECK-LABEL: scf.if
- // CHECK-SAME: (!xegpu.tensor_desc<16xf32>)
- %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>) {
- // CHECK-LABEL: xegpu.create_nd_tdesc
- // CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
- %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>
- scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- } else {
- // CHECK-LABEL: xegpu.create_nd_tdesc
- // CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
- %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- // CHECK-LABEL: scf.yield
- // CHECK-SAME: !xegpu.tensor_desc<16xf32>
- scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- }
- xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- gpu.return
- }
-
- // CHECK-LABEL: @subgroup_id_range
- gpu.func @subgroup_id_range(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) {
- %sg_id = gpu.subgroup_id : index
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c2 = arith.constant 2 : index
- %c31 = arith.constant 31 : index
- %c3 = arith.constant 3 : index
- %cond1 = arith.cmpi sge, %sg_id, %c0 : index
- %cond2 = arith.cmpi slt, %sg_id, %c1 : index
- %cond = arith.andi %cond1, %cond2 : i1
- scf.if %cond {
- // CHECK-NOT: index.sub
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- } {sg_id_range = #xegpu.range<[0, 32]>}
- %cond3 = arith.cmpi sge, %sg_id, %c2 : index
- %cond4 = arith.cmpi slt, %sg_id, %c31 : index
- %cond5 = arith.andi %cond3, %cond4 : i1
- scf.if %cond5 {
- // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
- // CHECK: %[[C2:.*]] = arith.constant 2 : index
- // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
- %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32>
- -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- -> vector<128x64xf32>
- %exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
- }{sg_id_range = #xegpu.range<[2, 18]>}
- gpu.return
- }
-
- // CHECK-LABEL: @subgroup_id_range_nested_if
- gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) {
- %sg_id = gpu.subgroup_id : index
- %c1 = arith.constant 1 : i1
- %c3 = arith.constant 3 : index
- %c32 = arith.constant 32 : index
- %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- -> vector<256x128xf32>
- %cond1 = arith.cmpi sge, %sg_id, %c3 : index
- %cond2 = arith.cmpi slt, %sg_id, %c32 : index
- %cond = arith.andi %cond1, %cond2 : i1
- scf.if %c1 {
- scf.if %cond {
- // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index
- // CHECK: %[[C3:.*]] = arith.constant 3 : index
- // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
- %td = xegpu.create_nd_tdesc %src1[0, 0] : memref<128x64xf32>
- -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %ld = xegpu.load_nd %td
- : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- -> vector<128x64xf32>
- %exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
- }
- } {sg_id_range = #xegpu.range<[3, 19]>}
- gpu.return
- }
-
- // CHECK-LABEL: distribute_constant
- gpu.func @distribute_constant() {
- // CHECK: arith.constant dense<1.000000e+00> : vector<32x32xf32>
- %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} dense<1.0> : vector<256x128xf32>
- gpu.return
- }
-}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 405e974500e08..5495b8f79f347 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -58,10 +58,10 @@ struct TestXeGPUUnrollingPatterns
xegpu::UnrollOptions options;
options.setNativeShapeFn([&](Operation *op)
-> std::optional<SmallVector<int64_t>> {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
- xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
- xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
+ xegpu::UpdateOffsetOp, xegpu::PrefetchOp, xegpu::LoadGatherOp,
+ xegpu::StoreScatterOp>(op)) {
xegpu::TensorDescType tdescTy;
if (auto createNdOp = dyn_cast<xegpu::CreateNdDescOp>(op)) {
tdescTy = createNdOp.getType();
@@ -73,8 +73,6 @@ struct TestXeGPUUnrollingPatterns
tdescTy = loadNdOp.getTensorDescType();
} else if (auto storeNdOp = dyn_cast<xegpu::StoreNdOp>(op)) {
tdescTy = storeNdOp.getTensorDescType();
- } else if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
- tdescTy = createOp.getType();
} else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
tdescTy = updateOp.getTensorDescType();
} else if (auto prefetchOp = dyn_cast<xegpu::PrefetchOp>(op)) {
>From 1e3ab7437974f883187009b749d2d9bbc2824e27 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 5 Feb 2026 19:11:38 +0000
Subject: [PATCH 2/4] Remove XeGPUFoldAliasOps pass
---
.../mlir/Dialect/XeGPU/Transforms/Passes.td | 9 --
.../Dialect/XeGPU/Transforms/Transforms.h | 2 -
.../Dialect/XeGPU/Transforms/CMakeLists.txt | 1 -
.../XeGPU/Transforms/XeGPUFoldAliasOps.cpp | 86 -------------------
.../Dialect/XeGPU/xegpu-fold-alias-ops.mlir | 20 -----
5 files changed, 118 deletions(-)
delete mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
delete mode 100644 mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index cb71f19da62f0..6d4a568f614bd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -11,15 +11,6 @@
include "mlir/Pass/PassBase.td"
-def XeGPUFoldAliasOps : Pass<"xegpu-fold-alias-ops"> {
- let summary = "Fold alias ops into XeGPU ops";
- let description = [{
- The pass folds aliasing ops into XeGPU ops that they operate on the original
- source references.
- }];
- let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect"];
-}
-
def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
let summary = "Distribute XeGPU ops to work items";
let description = [{
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index fede329990be4..f9ce4d229b6cc 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -61,8 +61,6 @@ struct UnrollOptions {
}
};
-/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
-void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
/// Appends patterns for optimizing block load operations into `patterns`.
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU SIMT distribution into `patterns`.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 47a3f371164fd..e4b98794d9217 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,6 +1,5 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUBlocking.cpp
- XeGPUFoldAliasOps.cpp
XeGPUSgToWiDistributeExperimental.cpp
XeGPUSubgroupDistribute.cpp
XeGPUUnroll.cpp
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
deleted file mode 100644
index 0db45895b87b2..0000000000000
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUFoldAliasOps.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-//===- XeGPUFoldAliasOps.cpp - XeGPU alias ops folders ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
-
-#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-namespace mlir {
-namespace xegpu {
-#define GEN_PASS_DEF_XEGPUFOLDALIASOPS
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
-} // namespace xegpu
-} // namespace mlir
-
-#define DEBUG_TYPE "xegpu-fold-alias-ops"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
-using namespace mlir;
-
-namespace {
-/// Merges subview operation with xegpu.create_nd_tdesc operation.
-/// NOTE: This pattern is currently disabled because CreateNdDescOp no longer
-/// supports offsets. Offsets should be specified on load/store/prefetch ops.
-/*
-class XegpuCreateNdDescOpSubViewOpFolder final
- : public OpRewritePattern<xegpu::CreateNdDescOp> {
-public:
- using OpRewritePattern<xegpu::CreateNdDescOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(xegpu::CreateNdDescOp descOp,
- PatternRewriter &rewriter) const override;
-};
-} // namespace
-
-LogicalResult XegpuCreateNdDescOpSubViewOpFolder::matchAndRewrite(
- xegpu::CreateNdDescOp descOp, PatternRewriter &rewriter) const {
- auto subViewOp = descOp.getSource().getDefiningOp<memref::SubViewOp>();
-
- if (!subViewOp)
- return rewriter.notifyMatchFailure(descOp, "not a subview producer");
- if (!subViewOp.hasUnitStride())
- return rewriter.notifyMatchFailure(descOp, "requires unit strides");
-
- SmallVector<Value> resolvedOffsets;
- affine::resolveIndicesIntoOpWithOffsetsAndStrides(
- rewriter, descOp.getLoc(), subViewOp.getMixedOffsets(),
- subViewOp.getMixedStrides(), subViewOp.getDroppedDims(),
- descOp.getMixedOffsets(), resolvedOffsets);
-
- rewriter.replaceOpWithNewOp<xegpu::CreateNdDescOp>(
- descOp, descOp.getTensorDesc().getType(), subViewOp.getSource(),
- getAsOpFoldResult(resolvedOffsets));
-
- return success();
-}
-*/
-} // namespace
-
-void xegpu::populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns) {
- // XegpuCreateNdDescOpSubViewOpFolder is disabled - CreateNdDescOp no longer supports offsets
- // patterns.add<XegpuCreateNdDescOpSubViewOpFolder>(patterns.getContext());
-}
-
-namespace {
-
-struct XeGPUFoldAliasOpsPass final
- : public xegpu::impl::XeGPUFoldAliasOpsBase<XeGPUFoldAliasOpsPass> {
- void runOnOperation() override;
-};
-
-} // namespace
-
-void XeGPUFoldAliasOpsPass::runOnOperation() {
- RewritePatternSet patterns(&getContext());
- xegpu::populateXeGPUFoldAliasOpsPatterns(patterns);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
-}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
deleted file mode 100644
index c25b41427437d..0000000000000
--- a/mlir/test/Dialect/XeGPU/xegpu-fold-alias-ops.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: mlir-opt -xegpu-fold-alias-ops -split-input-file %s | FileCheck %s
-
-func.func @fold_subview_with_xegpu_create_nd_tdesc(%arg0 : memref<256x256xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> vector<8x16xf32> {
- %subview = memref.subview %arg0[%arg1, %arg2] [32, 32] [1, 1] :
- memref<256x256xf32> to memref<32x32xf32, strided<[256, 1], offset: ?>>
- %0 = xegpu.create_nd_tdesc %subview :
- memref<32x32xf32, strided<[256, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.load_nd %0[%arg3, %arg4] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
- return %1 : vector<8x16xf32>
-}
-
-// CHECK: func @fold_subview_with_xegpu_create_nd_tdesc
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<256x256xf32>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
-// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index
-// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: index
-// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG0]][%[[ARG1]], %[[ARG2]]] [32, 32] [1, 1] : memref<256x256xf32> to memref<32x32xf32, strided<[256, 1], offset: ?>>
-// CHECK: %[[TDESC:.+]] = xegpu.create_nd_tdesc %[[SUBVIEW]] : memref<32x32xf32, strided<[256, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: %[[LOAD:.+]] = xegpu.load_nd %[[TDESC]][%[[ARG3]], %[[ARG4]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
>From 756891e6dd3149b6406003721ed40f87f477be19 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 5 Feb 2026 19:15:08 +0000
Subject: [PATCH 3/4] More clean up
---
mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 1 -
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 1 -
.../{xegpu-wg-to-sg-unify-ops-rr.mlir => xegpu-wg-to-sg-rr.mlir} | 0
.../XeGPU/{xegpu-wg-to-sg-unify-ops.mlir => xegpu-wg-to-sg.mlir} | 0
4 files changed, 2 deletions(-)
rename mlir/test/Dialect/XeGPU/{xegpu-wg-to-sg-unify-ops-rr.mlir => xegpu-wg-to-sg-rr.mlir} (100%)
rename mlir/test/Dialect/XeGPU/{xegpu-wg-to-sg-unify-ops.mlir => xegpu-wg-to-sg.mlir} (100%)
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index efdc0df199ca7..bd0933d898e00 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -177,7 +177,6 @@ class CreateNdDescToXeVMPattern
matchAndRewrite(xegpu::CreateNdDescOp op,
xegpu::CreateNdDescOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- // CreateNdDescOp no longer supports offsets (version 1 removed)
auto loc = op.getLoc();
auto source = op.getSource();
// Op is lowered to a code sequence that populates payload.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 074bf74d871fc..37d2e54479269 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -251,7 +251,6 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
- // CreateNdDescOp no longer supports offsets (version 1 removed)
SmallVector<size_t> newRetIndices;
rewriter.setInsertionPoint(warpOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
similarity index 100%
rename from mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
rename to mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
similarity index 100%
rename from mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
rename to mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
>From b42dbb05258159b7fd36eb5500f3680678b4973b Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 9 Feb 2026 23:24:06 +0000
Subject: [PATCH 4/4] Remove update_nd_offset & update_offset ops
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 102 -----------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 54 ------
.../XeGPU/Transforms/XeGPUBlocking.cpp | 3 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 21 ---
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 85 +--------
.../Transforms/XeGPUWgToSgDistribute.cpp | 29 +---
mlir/test/Dialect/XeGPU/ops.mlir | 19 --
.../XeGPU/propagate-layout-inst-data.mlir | 76 +++-----
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 97 ++++-------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 162 +++++++-----------
.../Dialect/XeGPU/xegpu-unroll-patterns.mlir | 24 ---
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 8 +-
12 files changed, 126 insertions(+), 554 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 26c1ffb936942..2af9a0d3eea51 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -576,55 +576,6 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
let hasVerifier = 1;
}
-def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
- [Pure, AllTypesMatch<["TensorDesc", "result"]>]> {
- let summary = "It updates the offsets for the TensorDesc.";
- let description = [{The op updates the offset of the given TensorDesc.
- The offsets are relative offset to the current position in the number
- of elements. It will result in a same type TensorDesc as the input.
-
- Example:
- ```
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
- ```
- }];
-
- let arguments = (ins
- XeGPU_TensorDesc: $TensorDesc,
- Variadic<Index>: $offsets,
- DenseI64ArrayAttr: $const_offsets);
-
- let results = (outs XeGPU_TensorDesc: $result);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- SmallVector<OpFoldResult> getMixedOffsets() {
- Builder b(getContext());
- return getMixedValues(getConstOffsets(), getOffsets(), b);
- }
-
- size_t getNumOffsets() {
- return getMixedOffsets().size();
- }
-
- OpFoldResult getOffset(unsigned idx) {
- assert(idx < getNumOffsets() && "Invalid out of bound access.");
- return getMixedOffsets()[idx];
- }
- }];
-
- let assemblyFormat = [{
- $TensorDesc `,`
- custom<DynamicIndexList>($offsets, $const_offsets)
- attr-dict `:` qualified(type($result))
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
@@ -1087,59 +1038,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
let hasVerifier = 1;
}
-def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
- [AllTypesMatch<["TensorDesc", "result"]>]> {
- let summary = "It updates the offsets for the given tensor descriptor";
-
- let description = [{It behaves similar to `update_nd_offset` in terms that
- it updates offset of a TensorDesc, and the offsets are relative offset to
- the current position in the number of elements. However, `update_nd_offset`
- is to update the start point of a 2D block, so its offset constains two
- elements representing the shift in each dimension. `update_offset` is to
- update the offset per lane, so its offsets contains values representing
- shifts for each lane.
-
- Example:
- ```mlir
- %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %off :
- !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>>, vector<4xindex>
- ```
-
- }];
-
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- XeGPU_OffsetType: $offsets);
- let results = (outs XeGPU_TensorDesc: $result);
-
- let builders = [
- OpBuilder<(ins "mlir::Value": $TensorDesc,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "mlir::Value": $TensorDesc,
- "llvm::ArrayRef<int64_t>": $offsets)>
- ];
-
- let extraClassDeclaration = [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- mlir::VectorType getOffsetsType() {
- return getOffsets().getType();
- }
-
- size_t getNumOffsets() {
- return getOffsetsType().getNumElements();
- }
- }];
-
- let assemblyFormat = [{
- $TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,` type($offsets)
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> {
let summary = "It performs mma computation";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index ad085f2c9fe9e..22d0eb3e5ef43 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -656,21 +656,6 @@ LogicalResult StoreNdOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// XeGPU_UpdateNDOffsetOp
-//===----------------------------------------------------------------------===//
-LogicalResult UpdateNdOffsetOp::verify() {
- auto ty = getTensorDescType();
- if (ty.isScattered())
- return emitOpError("Expects a non-scattered TensorDesc.\n");
-
- // number of offsets specified must match the rank of the tensor descriptor
- if (ty.getRank() != (int64_t)getNumOffsets()) {
- return emitOpError("Invalid number of offsets.");
- }
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_PrefetchOp
//===----------------------------------------------------------------------===//
@@ -881,45 +866,6 @@ void StoreScatterOp::build(
l3_hint, layout);
}
-//===----------------------------------------------------------------------===//
-// XeGPU_UpdateOffsetOp
-//===----------------------------------------------------------------------===//
-void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
- mlir::Value tensorDesc,
- llvm::ArrayRef<OpFoldResult> offsets) {
- auto tdescTy = mlir::dyn_cast<TensorDescType>(tensorDesc.getType());
- assert(tdescTy && "Expecting the source is a TensorDescType value.");
- auto loc = tensorDesc.getLoc();
- int64_t size = static_cast<int64_t>(offsets.size());
- auto type = VectorType::get({size}, builder.getIndexType());
- auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
- auto offset = vector::FromElementsOp::create(builder, loc, type, values);
- build(builder, state, tdescTy, tensorDesc, offset);
-}
-
-void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
- Value tensorDesc, llvm::ArrayRef<int64_t> offsets) {
- auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
- build(builder, state, tensorDesc, ofrs);
-}
-
-LogicalResult UpdateOffsetOp::verify() {
- auto tdescTy = getTensorDescType();
- if (!tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.\n");
-
- SmallVector<int64_t> expectedOffsetShape = getShapeOf(tdescTy);
- SmallVector<int64_t> offsetShape = getShapeOf(getOffsetsType());
- if (tdescTy.getChunkSizeAsInt() > 1)
- expectedOffsetShape.pop_back();
-
- if (expectedOffsetShape != offsetShape)
- return emitOpError(
- "Offsets should match TensorDesc except the chunk size dim.");
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index bd30d40ff4d64..b77fb3f543eb5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -180,8 +180,7 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
- xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
+ if (isa<xegpu::CreateNdDescOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
xegpu::StoreMatrixOp>(op))
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 50fb7b572e13d..b7ab8e896d9ae 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -395,10 +395,6 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results);
-
void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -469,9 +465,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](xegpu::LoadGatherOp loadGatherOp) {
visitLoadGatherOp(loadGatherOp, operands, results);
})
- .Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) {
- visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
- })
.Case([&](xegpu::PrefetchNdOp prefetchNdOp) {
visitPrefetchNdOp(prefetchNdOp, operands, results);
})
@@ -723,20 +716,6 @@ void LayoutInfoPropagation::visitShapeCastOp(
propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
}
-/// Propagate the layout of the result tensor to the source tensor descriptor
-/// in UpdateNdOffsetOp.
-void LayoutInfoPropagation::visitUpdateNdOffsetOp(
- xegpu::UpdateNdOffsetOp updateNdOffset,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results) {
- // The layout of the result must be present.
- LayoutInfo resultLayout = results[0]->getValue();
- if (!resultLayout.isAssigned())
- return;
- // Propagate the layout to the source operand.
- propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
-}
-
/// Set the layouts for DPAS A, B, and C operands.
void LayoutInfoPropagation::visitDpasOp(
xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 4f81f81cc351a..bad22a16686bd 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -189,34 +189,6 @@ struct UnrollCreateNdOp : public UnrollPattern<xegpu::CreateNdDescOp> {
}
};
-struct UnrollUpdateNdOffsetOp : public UnrollPattern<xegpu::UpdateNdOffsetOp> {
- using UnrollPattern<xegpu::UpdateNdOffsetOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::UpdateNdOffsetOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getTensorDescType();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<Type> convertedTdescTypes =
- getUnrolledTypes(tdescTy, *targetShape);
- SmallVector<Value> convertedTdesc = pack(
- op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
-
- SmallVector<Value> newOps;
- for (auto t : convertedTdesc) {
- auto newOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, loc, t.getType(), t, op.getOffsets(), op.getConstOffsets());
- newOps.push_back(newOp);
- }
- Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
- rewriter.replaceOp(op, castOp);
- return success();
- }
-};
-
struct UnrollPrefetchNdOp : public UnrollPattern<xegpu::PrefetchNdOp> {
using UnrollPattern<xegpu::PrefetchNdOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::PrefetchNdOp op,
@@ -821,59 +793,6 @@ struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
}
};
-struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
- using UnrollPattern<xegpu::UpdateOffsetOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getTensorDescType();
-
- if (!tdescTy.isScattered())
- return failure();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<Type> convertedTdescTypes =
- getUnrolledTypes(tdescTy, *targetShape);
- SmallVector<Value> convertedTdesc = pack(
- op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
-
- TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
- VectorType offsetVecTy = offsetVec.getType();
- SmallVector<Type> convertedOffsetTypes;
- SmallVector<Value> convertedOffsetVec;
- SmallVector<Value> newOps;
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
- if (originalChunkSize > 1) {
- auto targetOffsetShape = ArrayRef<int64_t>(*targetShape).drop_back();
- convertedOffsetTypes = getUnrolledTypes(offsetVecTy, targetOffsetShape);
-
- int64_t blockedChunkSize = targetShape->back();
- int64_t numNewChunks = originalChunkSize / blockedChunkSize;
- // the offset is reused across the chunk_size dimension
- for (auto offset : pack(offsetVec, convertedOffsetTypes,
- targetOffsetShape, loc, rewriter))
- convertedOffsetVec.append(numNewChunks, offset);
-
- } else {
- convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape);
- convertedOffsetVec =
- pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
- }
-
- for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
- auto newOp =
- xegpu::UpdateOffsetOp::create(rewriter, loc, t.getType(), t, o);
- newOps.push_back(newOp);
- }
- Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
- rewriter.replaceOp(op, castOp);
- return success();
- }
-};
-
struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
@@ -958,9 +877,9 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
patterns
- .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ .add<UnrollCreateNdOp, UnrollPrefetchNdOp,
UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollLoadGatherOp,
- UnrollStoreScatterOp, UnrollPrefetchOp, UnrollUpdateOffsetOp,
+ UnrollStoreScatterOp, UnrollPrefetchOp,
UnrollLoadMatrixOp, UnrollStoreMatrixOp,
UnrollLoadGatherOpWithOffset, UnrollStoreScatterOpWithOffsets>(
patterns.getContext(), options);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index b87afd02f385c..e668422a7b860 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -265,28 +265,6 @@ struct WgToSgPrefetchNdOpWithOffset
}
};
-/// This pattern transforms the UpdateNdOffsetOp to update the offsets of a
-/// subgroup descriptor. It creates an UpdateNdOffsetOp op to update the
-/// offsets of the new subgroup src tensor descriptors.
-struct WgToSgUpdateNdOffsetOp
- : public OpConversionPattern<xegpu::UpdateNdOffsetOp> {
- using OpConversionPattern<xegpu::UpdateNdOffsetOp>::OpConversionPattern;
- LogicalResult
- matchAndRewrite(xegpu::UpdateNdOffsetOp op, OneToNOpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- llvm::SmallVector<Value> newUpdateTileOffsetOps;
- for (auto tDesc : adaptor.getTensorDesc()) {
- auto newUpdateTileOffsetOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, op.getLoc(), tDesc.getType(), tDesc, op.getOffsets(),
- op.getConstOffsets());
- newUpdateTileOffsetOps.push_back(newUpdateTileOffsetOp);
- }
-
- rewriter.replaceOpWithMultiple(op, {newUpdateTileOffsetOps});
- return success();
- }
-};
-
/// This pattern transforms the DpasOp to work at subgroup level.
struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
@@ -1553,7 +1531,7 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns
.add<WgToSgCreateNdOpNoOffset,
WgToSgLoadNdOpWithOffset, WgToSgStoreNdOpWithOffset,
- WgToSgUpdateNdOffsetOp, WgToSgDpasOp,
+ WgToSgDpasOp,
WgToSgPrefetchNdOpWithOffset, UnrealizedConversionCastOpPattern,
WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp,
WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset,
@@ -1652,8 +1630,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return loadOp.getTensorDescType();
if (auto storeOp = dyn_cast<xegpu::StoreNdOp>(op))
return storeOp.getTensorDescType();
- if (auto updateOp = dyn_cast<xegpu::UpdateNdOffsetOp>(op))
- return updateOp.getType();
+
if (auto prefetchOp = dyn_cast<xegpu::PrefetchNdOp>(op))
return prefetchOp.getTensorDescType();
return xegpu::TensorDescType();
@@ -1664,7 +1641,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
};
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::LoadNdOp,
- xegpu::StoreNdOp, xegpu::UpdateNdOffsetOp,
+ xegpu::StoreNdOp,
xegpu::PrefetchNdOp>([=](Operation *op) -> bool {
auto tdescTy = getTensorDescType(op);
auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(tdescTy.getLayout());
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index f18e7c4237104..5d3eda53ef122 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -372,25 +372,6 @@ gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) {
gpu.return
}
-// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-
-// CHECK: gpu.func @update_nd_tdesc_2(%[[arg0:.*]]: memref<8x24x32xf32>) {
-gpu.func @update_nd_tdesc_2(%src: memref<8x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
- %1 = xegpu.create_nd_tdesc %src : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 0, 16] : !xegpu.tensor_desc<2x8x16xf32>
- %2 = xegpu.update_nd_offset %1, [0, 0, 16]: !xegpu.tensor_desc<2x8x16xf32>
- gpu.return
-}
-
-
// CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) {
gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) {
// CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 9183cd99c6afd..8e6408983aed1 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -59,38 +59,20 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
// -----
gpu.module @test_kernel {
gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %c1024 = arith.constant 1024 : index
- %block_id_x = gpu.block_id x
- %block_id_y = gpu.block_id y
- %m = arith.muli %block_id_x, %c32 : index
-
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
- %out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
- -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
- //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
- %a = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
-
- //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<16x32xf16>
- %c = arith.addf %a, %b : vector<16x32xf16>
-
- //CHECK-COUNT: xegpu.store_nd {{.*}}[0, 0] : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
- xegpu.store_nd %c, %arg2[0, 0]: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
-
- //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
- %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
- : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
- }
+ //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
+ //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
+ %a = xegpu.load_nd %a_tdesc[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+ %b = xegpu.load_nd %b_tdesc[0, 0] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+
+ //CHECK: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<16x32xf16>
+ %c = arith.addf %a, %b : vector<16x32xf16>
+
+ //CHECK: xegpu.store_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
+ xegpu.store_nd %c, %c_tdesc[0, 0]: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
gpu.return
}
}
@@ -98,38 +80,20 @@ gpu.module @test_kernel {
// -----
gpu.module @test_kernel {
gpu.func @elementwise_with_inst_data_12(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %c1024 = arith.constant 1024 : index
- %block_id_x = gpu.block_id x
- %block_id_y = gpu.block_id y
- %m = arith.muli %block_id_x, %c32 : index
-
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16>
- %out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
- -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
- //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
- %a = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
- %b = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
-
- //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : vector<12x32xf16>
- %c = arith.addf %a, %b : vector<12x32xf16>
-
- //CHECK-COUNT: xegpu.store_nd {{.*}}[0, 0] : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
- xegpu.store_nd %c, %arg2[0, 0]: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
-
- //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
- %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
- : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
- }
+ //CHECK: xegpu.load_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
+ //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
+ %a = xegpu.load_nd %a_tdesc[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+ %b = xegpu.load_nd %b_tdesc[0, 0] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
+
+ //CHECK: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : vector<12x32xf16>
+ %c = arith.addf %a, %b : vector<12x32xf16>
+
+ //CHECK: xegpu.store_nd {{.*}}[0, 0] <{layout = #xegpu.layout<inst_data = [4, 16]>}> : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
+ xegpu.store_nd %c, %c_tdesc[0, 0]: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 075254513975a..20cf996110381 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -307,21 +307,22 @@ gpu.module @test {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
-// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK: xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK: xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]]:3 = scf.for {{.*}} iter_args({{.*}}) ->
+// CHECK-SAME: (vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>) {
+// CHECK: xegpu.dpas {{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
-// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.load_nd %[[T0]][%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK: xegpu.load_nd %[[T1]][%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK: scf.yield {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
+// CHECK: }
+// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T2]]#2, %[[T3]][0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -329,16 +330,16 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.create_nd_tdesc %arg1 : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
- %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) {
- %4 = xegpu.load_nd %arg4[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %5 = xegpu.load_nd %arg5[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16>
- %8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16>
- scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>
+ %a_init = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %b_init = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %a_init, %arg5 = %b_init, %arg6 = %cst) -> (vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>) {
+ %6 = xegpu.dpas %arg4, %arg5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %4 = xegpu.load_nd %0[%c0, %arg3] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %5 = xegpu.load_nd %1[%arg3, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ scf.yield %4, %5, %6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
}
%3 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %2#2, %3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %2#2, %3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
@@ -426,38 +427,7 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
return
}
}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @update_nd_offset_1d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
- %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32>
- xegpu.store_nd %1, %2[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
- return
-}
-}
-// -----
-gpu.module @test {
-// CHECK-LABEL: func.func @update_nd_offset_2d(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
- %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
- xegpu.store_nd %1, %2[0, 0] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
- return
-}
-}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
@@ -488,12 +458,12 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
gpu.module @test {
// CHECK-LABEL: func.func @scf_while_and_condition(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
-// CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
-// CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
-// CHECK: scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK: %{{.*}}:2 = scf.while ({{.*}}) : (vector<16xf32>, i32)
+// CHECK-SAME: -> (vector<16xf32>, i32) {
+// CHECK: scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32
// CHECK-NEXT: } do {
-// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>):
-// CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32):
+// CHECK: scf.yield {{.*}} : vector<16xf32>, i32
// CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) {
%c0 = arith.constant 0 : i32
@@ -503,17 +473,16 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
%1 = xegpu.load_nd %0[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
%2 = xegpu.create_nd_tdesc %arg1 : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
- %3:3 = scf.while (%arg2 = %1, %arg3 = %c0, %arg4 = %0) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>)
- -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) {
+ %3:2 = scf.while (%arg2 = %1, %arg3 = %c0) : (vector<16xf32>, i32)
+ -> (vector<16xf32>, i32) {
%4 = arith.cmpi slt, %arg3, %c256 : i32
- scf.condition(%4) %arg2, %arg3, %arg4 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
+ scf.condition(%4) %arg2, %arg3 : vector<16xf32>, i32
} do {
- ^bb0(%arg2: vector<16xf32>, %arg3: i32, %arg4: !xegpu.tensor_desc<16xf32>):
+ ^bb0(%arg2: vector<16xf32>, %arg3: i32):
xegpu.store_nd %arg2, %2[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32>
%4 = arith.addi %arg3, %c16 : i32
- %5 = xegpu.update_nd_offset %arg4, [16] : !xegpu.tensor_desc<16xf32>
- %6 = xegpu.load_nd %5[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- scf.yield %6, %4, %5 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>
+ %5 = xegpu.load_nd %0[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ scf.yield %5, %4 : vector<16xf32>, i32
}
return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index e6f6eb976577b..bb9af380f6f29 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -19,21 +19,19 @@ gpu.module @test_kernel {
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a_init = xegpu.load_nd %a_tdesc[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b_init = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
- -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ iter_args(%arg0 = %a_init, %arg1 = %b_init, %arg2 = %c_init)
+ -> (vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>) {
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c
- : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+ %c = xegpu.dpas %arg0, %arg1, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ scf.yield %a, %b, %c
+ : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
@@ -60,21 +58,19 @@ gpu.module @test_kernel {
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a_init = xegpu.load_nd %a_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b_init = xegpu.load_nd %b_tdesc[0, 0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
- -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0[0, 0] {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1[0, 0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ iter_args(%arg0 = %a_init, %arg1 = %b_init, %arg2 = %c_init)
+ -> (vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>) {
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c
- : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
+ %c = xegpu.dpas %arg0, %arg1, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ scf.yield %a, %b, %c
+ : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
@@ -104,20 +100,18 @@ gpu.module @test_kernel {
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
+ //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %a_init = xegpu.load_nd %a_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+ //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %b_init = xegpu.load_nd %b_tdesc[0, 0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c16
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
- -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) {
- //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0[0, 0] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
- //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1[0, 0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
- //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
- //CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<16x32xf16, #l2>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c
- : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
+ iter_args(%arg0 = %a_init, %arg1 = %b_init, %arg2 = %c_init)
+ -> (vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32>) {
+ %c = xegpu.dpas %arg0, %arg1, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+ %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
+ scf.yield %a, %b, %c
+ : vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32>
}
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
@@ -145,23 +139,21 @@ gpu.module @test_kernel {
%a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a_init = xegpu.load_nd %a_tdesc[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b_init = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
%out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
- -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0[0, 0] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
- //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1[0, 0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ iter_args(%arg0 = %a_init, %arg1 = %b_init, %arg2 = %c_init)
+ -> (vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>) {
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
- %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
+ %e = math.exp %arg0 {layout_result_0 = #a} : vector<16x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
- //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c
- : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+ %c = xegpu.dpas %e, %arg1, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ scf.yield %a, %b, %c
+ : vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out#2, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
@@ -173,9 +165,7 @@ gpu.module @test_kernel {
#l = #xegpu.layout<inst_data = [8, 16]>
gpu.module @test_kernel {
gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
- %c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
- %c1024 = arith.constant 1024 : index
%block_id_x = gpu.block_id x
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
@@ -184,26 +174,16 @@ gpu.module @test_kernel {
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l>
- %out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
- -> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) {
- //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
-
- //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
- %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
-
- //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- xegpu.store_nd %c, %arg2[0, 0] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
-
- //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
- %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
- : !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>
- }
+ //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %b = xegpu.load_nd %b_tdesc[0, 0] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+
+ //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
+ %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
+
+ //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ xegpu.store_nd %c, %c_tdesc[0, 0] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
+
gpu.return
}
}
@@ -212,9 +192,7 @@ gpu.module @test_kernel {
#l = #xegpu.layout<inst_data = [8]>
gpu.module @test_kernel {
gpu.func @elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
- %c0 = arith.constant 0 : index
%c32 = arith.constant 32 : index
- %c1024 = arith.constant 1024 : index
%block_id_x = gpu.block_id x
%block_id_y = gpu.block_id y
%m = arith.muli %block_id_x, %c32 : index
@@ -223,26 +201,16 @@ gpu.module @test_kernel {
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l>
- %out:3 = scf.for %k = %c0 to %c1024 step %c32
- iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
- -> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) {
- //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
- %a = xegpu.load_nd %arg0[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
- %b = xegpu.load_nd %arg1[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
-
- //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
- %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
-
- //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
- xegpu.store_nd %c, %arg2[0] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
-
- //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16>
- %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l>
- %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32] : !xegpu.tensor_desc<32xf16, #l>
- %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c32] : !xegpu.tensor_desc<32xf16, #l>
- scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
- : !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>
- }
+ //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
+ %a = xegpu.load_nd %a_tdesc[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %b = xegpu.load_nd %b_tdesc[0] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+
+ //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
+ %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
+
+ //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
+ xegpu.store_nd %c, %c_tdesc[0] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
+
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index 667ec8c7ecc30..69b81ca3c0ccc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -28,30 +28,6 @@ gpu.module @test {
//-----
- // CHECK-LABEL: update_nd_tdesc
- // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
- // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
- // CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32>
- gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
- %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
- %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
- gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
- }
-
- //-----
-
- // CHECK-LABEL: update_nd_tdesc_1d
- // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
- // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
- // CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32>
- gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
- %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
- %update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
- gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
- }
-
- //-----
-
// CHECK-LABEL: prefetch_nd_tdesc
// CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
// CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 5495b8f79f347..0361852229e9a 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -58,23 +58,19 @@ struct TestXeGPUUnrollingPatterns
xegpu::UnrollOptions options;
options.setNativeShapeFn([&](Operation *op)
-> std::optional<SmallVector<int64_t>> {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
+ if (isa<xegpu::CreateNdDescOp,
xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
- xegpu::UpdateOffsetOp, xegpu::PrefetchOp, xegpu::LoadGatherOp,
+ xegpu::PrefetchOp, xegpu::LoadGatherOp,
xegpu::StoreScatterOp>(op)) {
xegpu::TensorDescType tdescTy;
if (auto createNdOp = dyn_cast<xegpu::CreateNdDescOp>(op)) {
tdescTy = createNdOp.getType();
- } else if (auto updateNdOp = dyn_cast<xegpu::UpdateNdOffsetOp>(op)) {
- tdescTy = updateNdOp.getTensorDescType();
} else if (auto prefetchNdOp = dyn_cast<xegpu::PrefetchNdOp>(op)) {
tdescTy = prefetchNdOp.getTensorDescType();
} else if (auto loadNdOp = dyn_cast<xegpu::LoadNdOp>(op)) {
tdescTy = loadNdOp.getTensorDescType();
} else if (auto storeNdOp = dyn_cast<xegpu::StoreNdOp>(op)) {
tdescTy = storeNdOp.getTensorDescType();
- } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
- tdescTy = updateOp.getTensorDescType();
} else if (auto prefetchOp = dyn_cast<xegpu::PrefetchOp>(op)) {
tdescTy = prefetchOp.getTensorDescType();
} else if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
More information about the Mlir-commits
mailing list