[Mlir-commits] [mlir] [MLIR][XeGPU] Add anchor_layout and update propagation to honor user-specified layouts (PR #169267)
Jianhui Li
llvmlistbot at llvm.org
Sun Nov 23 18:33:13 PST 2025
https://github.com/Jianhui-Li created https://github.com/llvm/llvm-project/pull/169267
Introduce anchor layout for XeGPU anchor ops: load_nd, store_nd, prefetch_nd, dpas, load, store, prefetch, load_matrix, store_matrix, and atomic_rmw. The anchor layout is permanent and is guaranteed to be honored by XeGPU distribution and lowerings once specified.
1. Add anchor_layout for XeGPU anchor ops: load_nd, store_nd, prefetch_nd, dpas, load, store, prefetch, load_matrix, store_matrix, and atomic_rmw.
2. Rename the layout attribute to anchor_layout for these ops: load, store, load_matrix, and store_matrix.
3. Update the layout propagation pass: only when the user does not specify an anchor layout does the pass compute a default layout, record it as the anchor op's permanent layout, and use it for propagation. If the user specifies an anchor layout, the pass takes the user-specified anchor layout. See the example sketch after this list.
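For illustration, a minimal sketch of user-specified anchor layouts on the nd-path anchor ops (attribute names and assembly follow the updated tests in this patch; the module/function names, shapes, and inst_data values are made up for the example):

  gpu.module @anchor_example {
    func.func @dpas_with_user_anchor(%a_td: !xegpu.tensor_desc<8x16xf16>,
                                     %b_td: !xegpu.tensor_desc<16x16xf16>,
                                     %c_td: !xegpu.tensor_desc<8x16xf32>) {
      // The user pins instruction-level layouts on the anchor ops; the
      // propagation pass honors these instead of computing defaults.
      %a = xegpu.load_nd %a_td <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>
          : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
      %b = xegpu.load_nd %b_td <{anchor_layout = #xegpu.layout<inst_data = [16, 16]>}>
          : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
      %acc = arith.constant dense<0.0> : vector<8x16xf32>
      %d = xegpu.dpas %a, %b, %acc
          {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>,
           anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>,
           anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>}
          : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
      xegpu.store_nd %d, %c_td <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>
          : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
      return
    }
  }

With these attributes present, the propagation pass skips the default-layout computation for these ops and propagates the user-specified layouts to their operands.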
>From b3f2a4ab3d57ff906e03dd03b6365ba99d2169bf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 22 Nov 2025 00:26:44 +0000
Subject: [PATCH 1/3] adding anchor layout for load/store/prefetch_nd and dpas
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 29 ++++++++++++-------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 ++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 28 ++++++++++--------
.../Transforms/XeGPUSubgroupDistribute.cpp | 4 +--
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +--
.../Transforms/XeGPUWgToSgDistribute.cpp | 20 +++++++++----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 9 +++---
mlir/test/Dialect/XeGPU/invalid.mlir | 6 ++--
.../Dialect/XeGPU/subgroup-distribute.mlir | 12 ++++----
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 6 ++--
11 files changed, 72 insertions(+), 52 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4c67856b559b1..9ddc408a17f7f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -268,7 +268,8 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
xegpu::TensorDescType getTensorDescType() {
@@ -360,7 +361,8 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
OptionalAttr<DenseI64ArrayAttr>: $transpose,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs XeGPU_ValueType: $value);
@@ -454,7 +456,8 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
OptionalAttr<DenseI64ArrayAttr>: $const_offsets,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
VectorType getValueType() {
@@ -1046,7 +1049,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $layout)>
+ "xegpu::DistributeLayoutAttr": $anchor_layout)>
];
let hasVerifier = 1;
@@ -1133,7 +1136,11 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
let arguments = (ins
XeGPU_DpasOprType : $lhs,
XeGPU_DpasOprType : $rhs,
- Optional<XeGPU_DpasResType>: $acc);
+ Optional<XeGPU_DpasResType>: $acc,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_a,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_b,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout_cd
+ );
let results = (outs XeGPU_DpasResType: $result);
let extraClassDeclaration = [{
@@ -1319,7 +1326,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$layout
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout
);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
let assemblyFormat = [{
@@ -1338,7 +1345,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be
lowered to a subgroup block load. When this attribute is present,
the offsets are subgroup-uniform across all lanes.
- - `layout`: [optional] An attribute for guiding distributions among
+ - `anchor_layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
Results:
@@ -1347,7 +1354,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
let builders = [
OpBuilder<(ins "Type":$res, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
@@ -1373,7 +1380,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
OptionalAttr<UnitAttr>:$subgroup_block_io,
- OptionalAttr<DistributeLayoutAttr>:$layout
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout
);
let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands)}];
@@ -1389,13 +1396,13 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `subgroup_block_io`: [optional] An attribute indicating that the operation can be
lowered to a subgroup block store. When this attribute is present,
the offsets are subgroup-uniform across all lanes.
- - `layout`: [optional] An attribute for guiding distributions among
+ - `anchor_layout`: [optional] An attribute for guiding distributions among
subgroups and/or work-items. It currently can accept either
LayoutAttr or SliceAttr.
}];
let builders = [
OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
- "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $layout)>,
+ "llvm::ArrayRef<OpFoldResult>": $offsets, "DistributeLayoutAttr": $anchor_layout)>,
];
let extraClassDeclaration = [{
SmallVector<OpFoldResult> getMixedOffsets() {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fb5d1e758dbd1..b3d2c40712c96 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -22,6 +22,8 @@ using std::optional;
namespace mlir {
namespace xegpu {
+//#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc"
+
void XeGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 85c9a966f0fe8..3240c0f40ce58 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -465,7 +465,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l3_hint) {
return build(builder, state, tensorDesc, ValueRange(), DenseI64ArrayAttr(),
- l1_hint, l2_hint, l3_hint);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
@@ -480,7 +480,7 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, tensorDesc, dynamicOffsets, staticOffsetsAttr, l1_hint,
- l2_hint, l3_hint);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
LogicalResult PrefetchNdOp::verify() {
@@ -519,7 +519,7 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
return build(builder, state, retType, tensorDesc, ValueRange(),
DenseI64ArrayAttr(), packed, transpose, l1_hint, l2_hint,
- l3_hint);
+ l3_hint, /*anchor_layout=*/nullptr);
}
void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
@@ -535,7 +535,8 @@ void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, retType, tensorDesc, dynamicOffsets, staticOffsetsAttr,
- packed, transpose, l1_hint, l2_hint, l3_hint);
+ packed, transpose, l1_hint, l2_hint, l3_hint,
+ /*anchor_layout=*/nullptr);
}
LogicalResult LoadNdOp::verify() {
@@ -638,7 +639,8 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
xegpu::CachePolicyAttr l3_hint) {
return build(builder, state, value, tensorDesc, ValueRange(),
- DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint);
+ DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint,
+ /*anchor_layout=*/nullptr);
}
void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
@@ -653,7 +655,7 @@ void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value,
auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
build(builder, state, value, tensorDesc, dynamicOffsets, staticOffsetsAttr,
- l1_hint, l2_hint, l3_hint);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
LogicalResult StoreNdOp::verify() {
@@ -876,7 +878,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
- l1_hint, l2_hint, l3_hint, /*layout=*/nullptr);
+ l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
@@ -892,7 +894,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
auto offset = vector::FromElementsOp::create(builder, loc, type, values);
build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
- l2_hint, l3_hint, /*layout=*/nullptr);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
@@ -960,7 +962,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
- l2_hint, l3_hint, /*layout=*/nullptr);
+ l2_hint, l3_hint, /*anchor_layout=*/nullptr);
}
void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
@@ -978,7 +980,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
// Call the correct builder overload that does not expect result types.
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
- l3_hint, /*layout=*/nullptr);
+ l3_hint, /*anchor_layout=*/nullptr);
}
void StoreScatterOp::build(
@@ -1155,7 +1157,8 @@ LogicalResult LoadMatrixOp::verify() {
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
- getLayoutAttr(), [&]() { return emitError(); });
+ getAnchorLayoutAttr(),
+ [&]() { return emitError(); });
}
//===----------------------------------------------------------------------===//
@@ -1179,7 +1182,8 @@ LogicalResult StoreMatrixOp::verify() {
UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
MemDescType mdescTy = getMemDesc().getType();
return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
- getLayoutAttr(), [&]() { return emitError(); });
+ getAnchorLayoutAttr(),
+ [&]() { return emitError(); });
}
namespace mlir {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4455811a2e681..ac65babfcb4cb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -965,7 +965,7 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getLayoutAttr();
+ auto layout = matrixOp.getAnchorLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
@@ -1041,7 +1041,7 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Value> offsetsAsValues =
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
- auto layout = matrixOp.getLayoutAttr();
+ auto layout = matrixOp.getAnchorLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
matrixOp, "the matrix operation lacks layout attribute");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 330553564f81a..b0b748c3409c3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -954,7 +954,7 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
Type elemTy = valueTy.getElementType();
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
@@ -993,7 +993,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
VectorType valueTy = llvm::dyn_cast<VectorType>(op.getData().getType());
assert(valueTy && "the value type must be vector type!");
ArrayRef<int64_t> shape = valueTy.getShape();
- auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getAnchorLayoutAttr());
SmallVector<Type> convertedValTypes =
getUnrolledTypes(valueTy, *targetShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 33d4b0457e5d3..2562c46adfa8d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -86,8 +86,16 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
if (origOffsets.empty())
return failure();
+ // if op is xegpu::CreateNdDescOp, call op.getLayoutAttr()
+ xegpu::DistributeLayoutAttr layout;
+ if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp> ||
+ std::is_same_v<OpType, xegpu::StoreMatrixOp>) {
+ layout = op.getAnchorLayoutAttr();
+ } else {
+ layout = op.getLayoutAttr();
+ }
+
// not applicable to ops without workgroup layout attributes
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -190,7 +198,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
xegpu::TensorDescType tdescTy = op.getType();
ArrayRef<int64_t> wgShape = tdescTy.getShape();
Type elemTy = tdescTy.getElementType();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
auto newTdescTy =
xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(),
@@ -999,7 +1007,7 @@ struct WgToSgLoadMatrixOp : public OpConversionPattern<xegpu::LoadMatrixOp> {
assert(valueTy && "the value type must be vector type!");
Type elemTy = valueTy.getElementType();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
VectorType newResTy = VectorType::get(sgShape, elemTy);
SmallVector<Value> newOps;
@@ -1025,7 +1033,7 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern<xegpu::StoreMatrixOp> {
if (failed(genOffsetsList(rewriter, op, offsetsList)))
return failure();
- xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = op.getAnchorLayoutAttr();
for (auto [v, offsets] : llvm::zip(adaptor.getData(), offsetsList))
xegpu::StoreMatrixOp::create(rewriter, op.getLoc(), v, op.getMemDesc(),
offsets, layout.dropSgLayoutAndData());
@@ -1409,12 +1417,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
target.addDynamicallyLegalOp<xegpu::LoadMatrixOp>(
[=](xegpu::LoadMatrixOp op) -> bool {
- return isLegal(op.getLayoutAttr());
+ return isLegal(op.getAnchorLayoutAttr());
});
target.addDynamicallyLegalOp<xegpu::StoreMatrixOp>(
[=](xegpu::StoreMatrixOp op) -> bool {
- return isLegal(op.getLayoutAttr());
+ return isLegal(op.getAnchorLayoutAttr());
});
target.addDynamicallyLegalOp<arith::ConstantOp>(
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index b0905c4e9203b..4fe35a16b3994 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -135,12 +135,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// for LoadMatrixOp, the layout is attached to the property of the op
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return loadOp.getLayoutAttr();
+ return loadOp.getAnchorLayoutAttr();
// for StoreMatrixOp, the layout is attached to the property of the op
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return storeOp.getLayoutAttr();
-
+ return storeOp.getAnchorLayoutAttr();
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
@@ -168,10 +167,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return loadOp.getLayoutAttr();
+ return loadOp.getAnchorLayoutAttr();
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return storeOp.getLayoutAttr();
+ return storeOp.getAnchorLayoutAttr();
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 92f353717ac59..62ac880030cda 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -894,7 +894,7 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve
// -----
func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) {
// expected-error at +1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>
return
}
@@ -902,7 +902,7 @@ func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32,
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the distributed dimensions must be contiguous}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>
return
}
@@ -910,7 +910,7 @@ func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf3
// -----
func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) {
// expected-error at +1 {{With subgroup_block_io, the block shape must match the lane layout}}
- xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>
return
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 8fd3cca5594cb..a7ce2c05b9d44 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -281,8 +281,8 @@ gpu.module @xevm_module{
gpu.module @xevm_module{
gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c0] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -307,8 +307,8 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c1] <{anchor_layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
gpu.return
}
}
@@ -323,9 +323,9 @@ gpu.module @xevm_module{
gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
!xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+ xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d61908b422194..456d8e8a03cfc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -569,7 +569,7 @@ gpu.module @test_kernel {
%0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
//CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
//CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
gpu.return %1: vector<32x32xf32>
}
}
@@ -580,7 +580,7 @@ gpu.module @test_kernel {
gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
// CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
- xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %value, %mdesc[0, 0] {anchor_layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 5dde84e8e0bc2..3760737cf51f5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -333,9 +333,9 @@ gpu.module @test_distribution {
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
//CHECK: [[c128:%.+]] = arith.constant 128 : index
//CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
- //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
+ //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{anchor_layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
%0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
gpu.return
}
@@ -361,7 +361,7 @@ gpu.module @test_distribution {
//CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
%cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} dense<1.0> : vector<64x128xf32>
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
- xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
+ xegpu.store_matrix %cst, %mdesc[0, 0] {anchor_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
>From bfae01fa3f6453ee1d0f67e98c3d6c2b1fcee8f2 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 22 Nov 2025 07:46:04 +0000
Subject: [PATCH 2/3] propagation honors pre-defined layout at anchor op
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 448 +++++++++++-------
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 8 +-
.../XeGPU/propagate-layout-inst-data.mlir | 16 +-
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 79 +--
.../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 4 +-
7 files changed, 328 insertions(+), 237 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9ddc408a17f7f..70c61a445e8ae 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -847,7 +847,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$layout);
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -906,7 +906,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
"xegpu::CachePolicyAttr": $l1_hint,
"xegpu::CachePolicyAttr": $l2_hint,
"xegpu::CachePolicyAttr": $l3_hint,
- "xegpu::DistributeLayoutAttr": $layout)>
+ "xegpu::DistributeLayoutAttr": $anchor_layout)>
];
let hasVerifier = 1;
@@ -991,7 +991,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<DistributeLayoutAttr>:$layout);
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration#[{
Type getDestType() {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index b3a780abd3f12..6d45a51ab0267 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -387,6 +387,8 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
+ bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout);
+
public:
LayoutInfoPropagation(DataFlowSolver &solver,
SymbolTableCollection &symbolTable,
@@ -475,49 +477,72 @@ LogicalResult LayoutInfoPropagation::visitOperation(
return success();
}
+bool LayoutInfoPropagation::hasAnchorLayout(
+ xegpu::DistributeLayoutAttr anchorLayout) {
+ if (anchorLayout == nullptr) {
+ return false;
+ }
+ if (layoutKind == LayoutKind::InstData) {
+ return !(anchorLayout.getEffectiveInstDataAsInt().empty());
+ } else if (layoutKind == LayoutKind::Lane) {
+ return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() ||
+ anchorLayout.getEffectiveLaneDataAsInt().empty());
+ }
+ return false;
+}
+
void LayoutInfoPropagation::visitPrefetchNdOp(
xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // Here we assign the default layout to the tensor descriptor operand of
- // prefetch.
- auto tdescTy = prefetch.getTensorDescType();
-
- auto uArch = getUArch(getChipStr(prefetch).value_or(""));
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
- uArch->getInstruction(
- xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
-
- auto blockWHC =
- uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
- if (!blockWHC)
- prefetch.emitWarning("No known block params found for the element type.");
- auto [bWidth, bHeight, bCount] = blockWHC.value();
- SmallVector<int> instData;
- int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
- bCount);
- if (instWidth == -1)
- prefetch.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- if (tdescTy.getRank() == 1)
- instData = {instWidth};
- else {
- int instHeight = xegpu::getLargestDivisor(
- static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
- if (instHeight == -1)
+
+ LayoutInfo prefetchLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ prefetchLayout = LayoutInfo(anchorLayout);
+ } else {
+ // Here we assign the default layout to the tensor descriptor operand of
+ // prefetch.
+ auto tdescTy = prefetch.getTensorDescType();
+
+ auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+ auto blockWHC =
+ uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+ if (!blockWHC)
+ prefetch.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = xegpu::getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
prefetch.emitWarning(
"No suitable instruction multiple found for the given shape.");
- instData = {instHeight, instWidth};
- }
- LayoutInfo prefetchLayout;
- if (layoutKind == LayoutKind::InstData)
- prefetchLayout =
- LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
- else
- prefetchLayout = getDefaultSIMTLayoutInfo(
- tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
+ if (tdescTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = xegpu::getLargestDivisor(
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ prefetch.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
+
+ if (layoutKind == LayoutKind::InstData)
+ prefetchLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
+ else
+ prefetchLayout = getDefaultSIMTLayoutInfo(
+ tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
+ prefetch.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get()));
+ }
// Propagate the layout to the source tensor descriptor.
propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
}
@@ -617,69 +642,96 @@ void LayoutInfoPropagation::visitUpdateNdOffsetOp(
void LayoutInfoPropagation::visitDpasOp(
xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- VectorType aTy = dpas.getLhsType();
- VectorType bTy = dpas.getRhsType();
-
- auto uArch = getUArch(getChipStr(dpas).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
- xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
-
- const unsigned dataALen = aTy.getShape().front();
- auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
- const int maxALen =
- xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
- if (maxALen == -1)
- dpas.emitWarning(
- "No suitable instruction multiple found for the given shape.");
-
- const unsigned dataBLen = bTy.getShape().back();
- auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
- const int maxBLen =
- xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
- if (maxBLen == -1)
- dpas.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- SmallVector<int> instDataA = {maxALen, subgroupSize};
- SmallVector<int> instDataB = {subgroupSize, maxBLen};
LayoutInfo dpasALayout;
LayoutInfo dpasBLayout;
LayoutInfo dpasCLayout;
- if (layoutKind == LayoutKind::InstData) {
- dpasALayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA));
- dpasBLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB));
+ xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr();
+ if (hasAnchorLayout(anchorLayoutC)) {
+ xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
+ xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
+ assert(hasAnchorLayout(anchorLayoutA) &&
+ "Expected anchor layout for DPAS A operand.");
+ assert(hasAnchorLayout(anchorLayoutB) &&
+ "Expected anchor layout for DPAS B operand.");
+ dpasALayout = LayoutInfo(anchorLayoutA);
+ dpasBLayout = LayoutInfo(anchorLayoutB);
+ dpasCLayout = LayoutInfo(anchorLayoutC);
+
} else {
- dpasALayout = getSIMTLayoutInfoForDPASOperand(
- aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA());
- dpasBLayout = getSIMTLayoutInfoForDPASOperand(
- bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB());
- }
- propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
- propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
- if (operands.size() > 2) {
- VectorType cTy = dpas.getAccType();
- const unsigned dataCLen = bTy.getShape().back();
- auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
- const int maxCLen =
- xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
- if (maxCLen == -1)
+ VectorType aTy = dpas.getLhsType();
+ VectorType bTy = dpas.getRhsType();
+
+ auto uArch = getUArch(getChipStr(dpas).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+ xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+ const unsigned dataALen = aTy.getShape().front();
+ auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
+ const int maxALen =
+ xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+ if (maxALen == -1)
dpas.emitWarning(
"No suitable instruction multiple found for the given shape.");
- SmallVector<int> instDataC = {maxALen, maxCLen};
- if (layoutKind == LayoutKind::InstData)
- dpasCLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
- else
- dpasCLayout = getSIMTLayoutInfoForDPASOperand(
- cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
+ const unsigned dataBLen = bTy.getShape().back();
+ auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType());
+
+ const int maxBLen =
+ xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+
+ if (maxBLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataA = {maxALen, subgroupSize};
+ SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+ if (layoutKind == LayoutKind::InstData) {
+ dpasALayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA));
+ dpasBLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB));
+ } else {
+ dpasALayout = getSIMTLayoutInfoForDPASOperand(
+ aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA());
+ dpasBLayout = getSIMTLayoutInfoForDPASOperand(
+ bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB());
+ }
+ if (operands.size() > 2) {
+ VectorType cTy = dpas.getAccType();
+ if (layoutKind == LayoutKind::InstData) {
+ const unsigned dataCLen = bTy.getShape().back();
+ auto supportedCLen =
+ uArchInstruction->getSupportedN(bTy.getElementType());
+ const int maxCLen = xegpu::getLargestDivisor(
+ dataCLen, ArrayRef<unsigned>(supportedCLen));
+ if (maxCLen == -1)
+ dpas.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ SmallVector<int> instDataC = {maxALen, maxCLen};
+ dpasCLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
+ } else
+ dpasCLayout = getSIMTLayoutInfoForDPASOperand(
+ cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
+
+ dpas.setAnchorLayoutCdAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasCLayout.get()));
+ }
+ dpas.setAnchorLayoutAAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasALayout.get()));
+ dpas.setAnchorLayoutBAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(dpasBLayout.get()));
+ }
+
+ propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
+ propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
+ if (operands.size() > 2) {
propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout));
}
}
@@ -689,43 +741,51 @@ void LayoutInfoPropagation::visitStoreNdOp(
xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- auto uArch = getUArch(getChipStr(store).value_or(""));
- const auto *uArchInstruction =
- dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
- uArch->getInstruction(
- xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
- VectorType dataTy = store.getValueType();
- auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
- store.getValueType().getElementType());
- if (!blockWHC)
- store.emitWarning("No known block params found for the element type.");
- auto [bWidth, bHeight, bCount] = blockWHC.value();
- SmallVector<int> instData;
- int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
- bCount);
- if (instWidth == -1)
- store.emitWarning(
- "No suitable instruction multiple found for the given shape.");
- if (dataTy.getRank() == 1)
- instData = {instWidth};
- else {
- int instHeight = xegpu::getLargestDivisor(
- static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
- if (instHeight == -1)
+ LayoutInfo storeLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ storeLayout = LayoutInfo(anchorLayout);
+ } else {
+ auto uArch = getUArch(getChipStr(store).value_or(""));
+ const auto *uArchInstruction =
+ dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+ uArch->getInstruction(
+ xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+ VectorType dataTy = store.getValueType();
+ auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+ store.getValueType().getElementType());
+ if (!blockWHC)
+ store.emitWarning("No known block params found for the element type.");
+ auto [bWidth, bHeight, bCount] = blockWHC.value();
+ SmallVector<int> instData;
+ int instWidth = xegpu::getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+ bCount);
+ if (instWidth == -1)
store.emitWarning(
"No suitable instruction multiple found for the given shape.");
- instData = {instHeight, instWidth};
- }
+ if (dataTy.getRank() == 1)
+ instData = {instWidth};
+ else {
+ int instHeight = xegpu::getLargestDivisor(
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+ if (instHeight == -1)
+ store.emitWarning(
+ "No suitable instruction multiple found for the given shape.");
+ instData = {instHeight, instWidth};
+ }
- LayoutInfo storeLayout;
- if (layoutKind == LayoutKind::InstData)
- storeLayout =
- LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
- else
- storeLayout =
- getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
- uArchInstruction->getPackedFormatBitSize());
+ if (layoutKind == LayoutKind::InstData)
+ storeLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
+ else
+ storeLayout =
+ getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
+ uArchInstruction->getPackedFormatBitSize());
+ store.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
+ }
+ // Propagate the layout to the value operand.
// Both operands should have the same layout
for (LayoutInfoLattice *operand : operands)
propagateIfChanged(operand, operand->meet(storeLayout));
@@ -736,21 +796,31 @@ void LayoutInfoPropagation::visitStoreNdOp(
void LayoutInfoPropagation::visitLoadNdOp(
xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo valueLayout = results[0]->getValue();
- // Need the layout of the value to propagate to the tensor descriptor.
- if (!valueLayout.isAssigned())
- return;
- LayoutInfo tensorDescLayout = valueLayout;
- // LoadNdOp has the transpose effect. However, at the stage of this analysis
- // this effect is not expected and should be abstracted away. Emit a
- // warning.
- if (auto transpose = load.getTranspose()) {
- load.emitWarning("Transpose effect is not expected for LoadNdOp at "
- "LayoutInfoPropagation stage.");
- tensorDescLayout = valueLayout.transpose(transpose.value());
+
+ LayoutInfo loadLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ loadLayout = LayoutInfo(anchorLayout);
+ } else {
+
+ LayoutInfo valueLayout = results[0]->getValue();
+ // Need the layout of the value to propagate to the tensor descriptor.
+ if (!valueLayout.isAssigned())
+ return;
+ loadLayout = valueLayout;
+ // LoadNdOp has the transpose effect. However, at the stage of this analysis
+ // this effect is not expected and should be abstracted away. Emit a
+ // warning.
+ if (auto transpose = load.getTranspose()) {
+ load.emitWarning("Transpose effect is not expected for LoadNdOp at "
+ "LayoutInfoPropagation stage.");
+ loadLayout = valueLayout.transpose(transpose.value());
+ }
+ load.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
}
// Propagate the new layout to the tensor descriptor operand.
- propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
+ propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
}
/// For vector::TransposeOp, the layout of the result is transposed and
@@ -840,37 +910,49 @@ void LayoutInfoPropagation::visitVectorBitcastOp(
void LayoutInfoPropagation::visitLoadGatherOp(
xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // The layout is strictly determined by the payload type.
- auto payloadTy = dyn_cast<VectorType>(load.getValueType());
- if (!payloadTy) {
- load.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- auto uArch = getUArch(getChipStr(load).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- SmallVector<int> instData{subgroupSize};
- if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
- instData.push_back(chunkSize);
- else if (auto srcTdescTy =
- dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
- if (srcTdescTy.getChunkSizeAsInt() > 1)
+
+ LayoutInfo loadLayout;
+ LayoutInfo maskLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ loadLayout = LayoutInfo(anchorLayout);
+ maskLayout = loadLayout;
+ } else {
+
+ // The layout is strictly determined by the payload type.
+ auto payloadTy = dyn_cast<VectorType>(load.getValueType());
+ if (!payloadTy) {
+ load.emitWarning("Not propagating, non-vector payload supplied.");
+ return;
+ }
+ auto uArch = getUArch(getChipStr(load).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+ SmallVector<int> instData{subgroupSize};
+ if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
instData.push_back(chunkSize);
- }
- LayoutInfo layout;
- if (layoutKind == LayoutKind::InstData)
- layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData));
- else
- layout = getDefaultSIMTLayoutInfo(payloadTy, uArch,
- uArch->getGeneralPackedFormatBitSize(),
- /*scattered*/ true);
-
- // Mask operand should have 1D default layout.
- LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
+ else if (auto srcTdescTy =
+ dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+ if (srcTdescTy.getChunkSizeAsInt() > 1)
+ instData.push_back(chunkSize);
+ }
+
+ if (layoutKind == LayoutKind::InstData)
+ loadLayout =
+ LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData));
+ else
+ loadLayout = getDefaultSIMTLayoutInfo(
+ payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(),
+ /*scattered*/ true);
+
+ // Mask operand should have 1D default layout.
+ maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
+ load.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
+ }
// Propagate the new layout to the tensor descriptor operand.
if (isa<xegpu::TensorDescType>(load.getSourceType()))
- propagateIfChanged(operands[0], operands[0]->meet(layout));
+ propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
// Propagate the new layout to the mask and optional offset operand.
propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
if (load.getOffsets())
@@ -898,21 +980,26 @@ void LayoutInfoPropagation::visitCreateDescOp(
void LayoutInfoPropagation::visitStoreScatterOp(
xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
- // Currently, for 2D StoreScatterOp we expect that the height dimension of
- // the tensor descriptor is equal to the subgroup size. This is ensured by
- // the op verifier.
- auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
- if (!payloadTy) {
- storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
- return;
- }
- LayoutInfo payloadLayout;
- auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
- const int subgroupSize = uArch->getSubgroupSize();
- if (auto layout = storeScatter.getLayoutAttr()) {
- payloadLayout = LayoutInfo(layout);
+ LayoutInfo payloadLayout;
+ LayoutInfo maskLayout;
+ xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr();
+ if (hasAnchorLayout(anchorLayout)) {
+ payloadLayout = LayoutInfo(anchorLayout);
+ maskLayout = payloadLayout;
} else {
+ // Currently, for 2D StoreScatterOp we expect that the height dimension of
+ // the tensor descriptor is equal to the subgroup size. This is ensured by
+ // the op verifier.
+ auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
+ if (!payloadTy) {
+ storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
+ return;
+ }
+
+ auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+ const int subgroupSize = uArch->getSubgroupSize();
+
if (layoutKind == LayoutKind::InstData) {
SmallVector<int> instData{subgroupSize};
if (auto chunkSize = storeScatter.getChunkSize().value_or(0);
@@ -936,10 +1023,13 @@ void LayoutInfoPropagation::visitStoreScatterOp(
payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(),
/*scattered=*/true);
}
- }
- LayoutInfo maskLayout =
- getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
+ maskLayout =
+ getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
+
+ storeScatter.setAnchorLayoutAttr(
+ dyn_cast<xegpu::DistributeLayoutAttr>(payloadLayout.get()));
+ }
// Propagate the payload operand layout
propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
// Propagate the destination (if tdesc) operand layout
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index b0b748c3409c3..c644f784606e9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset
pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter);
}
- auto layout = op.getLayoutAttr();
+ auto layout = op.getAnchorLayoutAttr();
if (layout)
layout = layout.dropInstData();
@@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets
SmallVector<Value> convertedValues =
pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
- auto layout = op.getLayoutAttr();
+ auto layout = op.getAnchorLayoutAttr();
if (layout)
layout = layout.dropInstData();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 4fe35a16b3994..572e5442760bc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// check for "permament" layout only after "temporary" layout name lookup
// for backward compatibility
if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- return loadGatherOp.getLayoutAttr();
+ return loadGatherOp.getAnchorLayoutAttr();
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
// check for "permament" layout only after "temporary" layout name lookup
if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- if (auto layout = storeScatterOp.getLayoutAttr())
+ if (auto layout = storeScatterOp.getAnchorLayoutAttr())
return layout;
return getDistributeLayoutAttr(opr.get());
@@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
xegpu::DistributeLayoutAttr candidate = layout;
if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
- if (auto perm = loadOp.getLayoutAttr())
+ if (auto perm = loadOp.getAnchorLayoutAttr())
candidate = perm;
}
@@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
if (idx == 0) {
- if (auto perm = storeOp.getLayoutAttr())
+ if (auto perm = storeOp.getAnchorLayoutAttr())
candidate = perm;
}
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index c31ef323a94d2..62a33a4797d2b 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -5,14 +5,14 @@
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>, anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>, anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
@@ -46,7 +46,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
@@ -85,7 +85,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
@@ -113,9 +113,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index eb004932af4be..d1bee47dd6d37 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -6,14 +6,14 @@ gpu.module @test {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -32,7 +32,8 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
-// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
@@ -46,8 +47,8 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -> vector<16x16xf16>
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -108,7 +109,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -135,7 +136,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -183,9 +184,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -204,7 +205,7 @@ gpu.module @test {
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -220,10 +221,10 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -239,11 +240,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
+// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
%offset = arith.constant dense<12> : vector<16xindex>
@@ -256,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -281,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
@@ -302,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
@@ -339,9 +340,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
@@ -362,9 +363,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -385,11 +386,11 @@ gpu.module @test {
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -397,7 +398,7 @@ gpu.module @test {
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>
// CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -425,11 +426,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
@@ -455,11 +456,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -539,7 +540,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_2d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @prefetch_2d(%arg0: memref<256x256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
@@ -552,7 +553,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @prefetch_1d(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
func.func @prefetch_1d(%arg0: memref<256xf16>){
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
@@ -599,7 +600,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -621,7 +622,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 3760737cf51f5..171cadeeaeaf9 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -286,7 +286,7 @@ gpu.module @test_distribution {
// CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
- // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
+ // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout<inst_data = [8]>, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
// CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
// CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
@@ -554,7 +554,7 @@ gpu.module @test_distribution {
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
- // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
+ // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>, chunk_size = 1 : i64}>
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
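For reference, the IR form that the updated dpas_f16 checks in propagate-layout-inst-data.mlir above describe looks roughly like the sketch below. It is assembled from the CHECK lines only; SSA names are illustrative, and `%b`, `%acc`, and `%c_td` are assumed to be produced the same way as `%a` in the test.

```mlir
// Sketch mirroring the CHECK lines above, not literal test output.
%a = xegpu.load_nd %a_td <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>
       {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>}
       : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
%d = xegpu.dpas %a, %b, %acc
       {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>,
        anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>,
        anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>,
        layout_result_0 = #xegpu.layout<inst_data = [8, 16]>}
       : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
xegpu.store_nd %d, %c_td <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}>
       : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
```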
>From 0482234e56256ac0824a4fb85bac492b50080fdc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 24 Nov 2025 01:58:41 +0000
Subject: [PATCH 3/3] adding documentation
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 192 ++++++++++++++----
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 20 +-
3 files changed, 165 insertions(+), 49 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 70c61a445e8ae..344fb23ba7b8d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -253,6 +253,22 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
It issues an instruction to prefetch a block of data from continuous
memory regions to each level of the cache based on their cache policy.
+ Arguments:
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of
+ memory and tensor tile to be prefetched.
+
+  - `offsets`: index values representing per-dimension offsets from the
+    base position encoded in `TensorDesc`. They are encoded via `offsets`
+    and `const_offsets`.
+
+  - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes
+    indicating the desired behavior at the L1, L2, and L3 cache levels.
+
+  - `anchor_layout`: [optional] An attribute that identifies the operation
+    as an anchor, enabling users to assign a layout that governs distribution
+    at the subgroup and/or work-item level. Only valid at workgroup and subgroup
+    levels.
+
Example:
```mlir
xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -326,16 +342,37 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
a block of data from memory to register. It takes a set of optional cache
hints for each level of cache, L1, L2 and L3. If hardware does not have a
correspoding cache, Corresponding cache hint attribute will be masked.
- VNNI transformation is an hardware feature for Intel GPU, which is used to
- do data packing during the load for B operand of matrix operation, if
- the bit width of the data type is less then 32 bits, e.g., fp16. And
- transpose is another Intel hardware feature, which will do transpose
- operation when loading the data if the bit width of the data type is
- fp32 or fp64. It implies that vnni and transpose cannot exit at the
- same time. It is only available to 1D or 2D blocked tensor_desc.
+
+ On Intel GPUs, hardware-supported packing rearranges data elements during
+ the load of the B operand when the element bit-width is less than 32 bits
+ (for example, fp16). The transpose feature reorders data during the load
+ when the element type is fp32 or fp64. These two features are mutually
+ exclusive and shall not be enabled simultaneously. Both features support only
+ 2D blocked tensor_desc.
In SIMT mode, result vector represents the data to be loaded by each work-item.
+ Arguments:
+
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory
+ and the tensor tile to be loaded.
+
+ - `offsets`: Index values representing per-dimension offsets from the base position
+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
+
+ - `packed`: [optional] A unit attribute indicating that packing is applied
+ during the load when supported by the hardware. Only valid at lane level.
+
+  - `transpose`: [optional] An attribute describing a hardware-supported transpose
+    to be applied during the load. Only valid at lane level.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
+ desired behavior at the L1, L2, and L3 cache levels.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.load_nd %1 {transpose = [1, 0],
@@ -391,7 +428,6 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getTensorDescType().getShape();
}
-
}];
let assemblyFormat = [{
@@ -432,6 +468,23 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
In SIMT mode, the input vector represents the data to be stored by each work-item.
+ Arguments:
+
+ - `value`: A vector value representing the tensor tile to be stored.
+
+ - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and
+ the tensor tile to be stored.
+
+ - `offsets`: Index values representing per-dimension offsets from the base position
+ encoded in `TensorDesc`. They are encoded via `offsets` and `const_offsets`.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] Cache-hint attributes indicating the
+ desired behavior at the L1, L2, and L3 cache levels.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
@@ -568,8 +621,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
It accepts the following parameters:
Arguments:
+
- `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
memory object.
+
- `offsets`: a vector containing offsets of each access point. Its size
is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
implying each element in the vector corresponds to a work-item (SIMT lane)
@@ -668,17 +723,25 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
it works on scattered TensorDesc instead.
Arguments:
+
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
- - `offset_align_byte`: required if `source` is a pointer. If `source` is not a pointer,
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `offset_align_byte`: [optional] required if `source` is a pointer. If `source` is not a pointer,
it is not allowed. Represents the alignment in bytes of each offset in offsets.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -727,7 +790,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
- OptionalAttr<I64Attr>:$offset_align_byte);
+ OptionalAttr<I64Attr>:$offset_align_byte,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let extraClassDeclaration = extraBaseClassDeclaration # [{
Type getSourceType() {
@@ -779,18 +843,27 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
each work-item. If size is not 1, size should be equal to the chunk size,
Arguments:
+
- `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
- `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
scalar mask is also valid for SIMT mode.
- - `chunk_size`: (optional) represents contiguous number of elements to load from per work item.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
+  - `chunk_size`: [optional] represents the number of contiguous elements to load per work-item.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
Results:
- `res`: represents loaded data
@@ -926,19 +999,30 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
each work-item. If size is not 1, size should be equal to the chunk size.
Arguments:
+
- `value`: represents the data to be stored.
+
- `dest`: represents the memory region to be stored to, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
+
- `offsets`: represents offsets from dest. required if `source` in not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
scalar mask is also valid for SIMT mode.
- - `chunk_size`: (optional) represents contiguous number of elements to store to per work item.
- - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
+  - `chunk_size`: [optional] represents the number of contiguous elements to store per work-item.
+
+ - `l1_hint`, `l2_hint`, `l3_hint`: [optional] cache hints for each level of cache.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup and/or
+ work-item level. Only valid at workgroup and subgroup levels.
+
Example 1:
```mlir
@@ -1115,22 +1199,28 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size
matrix , `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
- and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
- also requires A and B to be loaded with the required data layout. Specially,
- VNNI layout is required for B operand. It is achieved via adding `packed`
- attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
- can be represented as a 3D vector, with the last dimension representing the VNNI
- factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
- can be represented as `B: vector<8x16x2xf16>`.
+ and `C/D: vector<8x16xf32>`.
In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
which are represented as 1D vectors. Please refer to [OpenCL Intel extentions]
(https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
for more details about the fragment distribution.
- Note: on PVC, the hardware can perform load with VNNI transformation when data
- element type is 16-bit or lower precision, taking 2 or 4 elements from
- the first dimension and inserted into the newly added innermost dimension.
+ Arguments:
+
+ - `lhs`: A vector value representing the left-hand-side matrix tile (A) participating in the
+ matrix multiply.
+
+ - `rhs`: A vector value representing the right-hand-side matrix tile (B).
+
+ - `acc`: [optional] A vector value representing the accumulator matrix tile (C). When present, the
+ result is computed as `lhs * rhs + acc`; otherwise, the accumulator is implicitly assumed to be zero.
+
+  - `anchor_layout_a`, `anchor_layout_b`, `anchor_layout_cd`: [optional] Attributes that identify this
+    operation as an anchor for operands A, B, and the accumulator/result, enabling users to assign layouts
+    that govern distribution at the subgroup and/or work-item level. Only valid at workgroup and subgroup
+    levels.
+
}];
let arguments = (ins
@@ -1187,13 +1277,31 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
has the same shape with `TensorDesc`, and is used to enable or disable specific
data points of the `TensorDesc`. The `value` operand represents the new value to
be applied during the modification.
+ Arguments:
+ - `kind`: An attribute that specifies the atomic operation to be performed
+ (e.g., add, min, max, exchange, etc.).
+
+ - `tensorDesc`: A `TensorDesc` describing the memory region on which the atomic
+ read-modify-write is performed.
+
+ - `mask`: A predicate mask with the same shape as `tensorDesc`. Only elements
+ with a true (non-zero) mask value participate in the atomic operation;
+ masked-out elements are not modified.
+
+ - `value`: The input values used by the atomic operation. It must have the same
+ shape and element type as `tensorDesc` and `result`.
+
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor,
+ enabling users to assign a layout that governs distribution at the subgroup
+ and/or work-item level. Only valid at workgroup and subgroup levels.
}];
let arguments = (ins
AtomicRMWKindAttr:$kind,
XeGPU_TensorDesc:$tensorDesc,
XeGPU_MaskType:$mask,
- XeGPU_ValueType:$value);
+ XeGPU_ValueType:$value,
+ OptionalAttr<DistributeLayoutAttr>:$anchor_layout);
let results = (outs XeGPU_ValueType:$result);
@@ -1275,6 +1383,13 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
the IR is lowered to WI level because that is the end result of all distributions.
+ Arguments:
+ - `source`: The input vector whose data is to be redistributed. The source and
+ result types must match.
+ - `input_layout`: The layout attribute describing the current distribution of `source`
+ across subgroups and/or work-items.
+ - `target_layout`: The layout attribute describing the desired distribution of the result
+ across subgroups and/or work-items.
}];
let arguments = (ins XeGPU_VectorType: $source,
DistributeLayoutAttr: $input_layout,
@@ -1342,12 +1457,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
Arguments:
- `mem_desc`: the memory descriptor identifying the SLM region.
- `offsets`: the coordinates within the matrix to read from.
- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
- lowered to a subgroup block load. When this attribute is present,
- the offsets are subgroup-uniform across all lanes.
- - `anchor_layout`: [optional] An attribute for guiding distributions among
- subgroups and/or work-items. It currently can accept either
- LayoutAttr or SliceAttr.
+  - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
+    to a subgroup block load. When this attribute is present, the offsets are subgroup-uniform
+    across all lanes. Only used at the subgroup and lane levels.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
+ Only valid at workgroup and subgroup levels.
+
Results:
- `res`: the matrix elements loaded from SLM.
}];
@@ -1393,12 +1509,12 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- `mem_desc`: the memory descriptor specifying the SLM region.
- `offsets`: the coordinates within the matrix where the data will be written.
- `data`: the values to be stored in the matrix.
- - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
- lowered to a subgroup block store. When this attribute is present,
- the offsets are subgroup-uniform across all lanes.
- - `anchor_layout`: [optional] An attribute for guiding distributions among
- subgroups and/or work-items. It currently can accept either
- LayoutAttr or SliceAttr.
+  - `subgroup_block_io`: [optional] An attribute indicating that the operation can be lowered
+    to a subgroup block store. When this attribute is present, the offsets are subgroup-uniform
+    across all lanes. Only used at the subgroup and lane levels.
+ - `anchor_layout`: [optional] An attribute that identifies the operation as an anchor, enabling
+ users to assign a layout that governs distribution at the subgroup and/or work-item level.
+ Only valid at workgroup and subgroup levels.
}];
let builders = [
OpBuilder<(ins "Value" : $data, "TypedValue<MemDescType>": $mem_desc,
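As a usage sketch of the `anchor_layout` argument documented above (operand values are borrowed from the scatter tests in this patch; the snippet is not part of the change itself), a user can pin the layout on a scatter load/store up front, and the propagation code in the next file then takes such a layout as-is via the `hasParamsOfLayoutKind` check instead of computing a default.

```mlir
// Sketch: user-specified anchor layouts on scatter ops, assuming %src,
// %offsets, and %mask are defined as in the scatter tests above.
%v = xegpu.load %src[%offsets], %mask
       <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}>
       : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
xegpu.store %v, %src[%offsets], %mask
       <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}>
       : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
```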
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3240c0f40ce58..29daab384bf7f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -828,7 +828,7 @@ void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value source,
xegpu::CachePolicyAttr l2_hint,
xegpu::CachePolicyAttr l3_hint) {
build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint,
- IntegerAttr{});
+ IntegerAttr{}, /*anchor_layout=*/nullptr);
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 6d45a51ab0267..3b5207dd92285 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -387,7 +387,7 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout);
+ bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout);
public:
LayoutInfoPropagation(DataFlowSolver &solver,
@@ -477,7 +477,7 @@ LogicalResult LayoutInfoPropagation::visitOperation(
return success();
}
-bool LayoutInfoPropagation::hasAnchorLayout(
+bool LayoutInfoPropagation::hasParamsOfLayoutKind(
xegpu::DistributeLayoutAttr anchorLayout) {
if (anchorLayout == nullptr) {
return false;
@@ -497,7 +497,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
LayoutInfo prefetchLayout;
xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
prefetchLayout = LayoutInfo(anchorLayout);
} else {
// Here we assign the default layout to the tensor descriptor operand of
@@ -648,12 +648,12 @@ void LayoutInfoPropagation::visitDpasOp(
LayoutInfo dpasCLayout;
xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr();
- if (hasAnchorLayout(anchorLayoutC)) {
+ if (hasParamsOfLayoutKind(anchorLayoutC)) {
xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr();
xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr();
- assert(hasAnchorLayout(anchorLayoutA) &&
+ assert(hasParamsOfLayoutKind(anchorLayoutA) &&
"Expected anchor layout for DPAS A operand.");
- assert(hasAnchorLayout(anchorLayoutB) &&
+ assert(hasParamsOfLayoutKind(anchorLayoutB) &&
"Expected anchor layout for DPAS B operand.");
dpasALayout = LayoutInfo(anchorLayoutA);
dpasBLayout = LayoutInfo(anchorLayoutB);
@@ -743,7 +743,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
LayoutInfo storeLayout;
xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
storeLayout = LayoutInfo(anchorLayout);
} else {
auto uArch = getUArch(getChipStr(store).value_or(""));
@@ -799,7 +799,7 @@ void LayoutInfoPropagation::visitLoadNdOp(
LayoutInfo loadLayout;
xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
} else {
@@ -914,7 +914,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
LayoutInfo loadLayout;
LayoutInfo maskLayout;
xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
loadLayout = LayoutInfo(anchorLayout);
maskLayout = loadLayout;
} else {
@@ -984,7 +984,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
LayoutInfo payloadLayout;
LayoutInfo maskLayout;
xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr();
- if (hasAnchorLayout(anchorLayout)) {
+ if (hasParamsOfLayoutKind(anchorLayout)) {
payloadLayout = LayoutInfo(anchorLayout);
maskLayout = payloadLayout;
} else {
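For the default path, the updated prefetch_2d check in propagate-layout.mlir shows how a computed layout is materialized as `anchor_layout`. As a standalone sketch copied from that CHECK line (the cache hints come from the test, not from the pass):

```mlir
// Sketch mirroring the prefetch_2d CHECK line; %tdesc stands for the
// tensor descriptor created in that test.
xegpu.prefetch_nd %tdesc <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
                           l1_hint = #xegpu.cache_hint<cached>,
                           l2_hint = #xegpu.cache_hint<uncached>}>
  : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```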