[Mlir-commits] [mlir] 2b9e477 - [MLIR][XeGPU] Refactor Layout access interface (#172125)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Dec 17 12:05:03 PST 2025
Author: Jianhui Li
Date: 2025-12-17T12:04:58-08:00
New Revision: 2b9e47749ca1eb337ea26b8084dad52732dc7186
URL: https://github.com/llvm/llvm-project/commit/2b9e47749ca1eb337ea26b8084dad52732dc7186
DIFF: https://github.com/llvm/llvm-project/commit/2b9e47749ca1eb337ea26b8084dad52732dc7186.diff
LOG: [MLIR][XeGPU] Refactor Layout access interface (#172125)
This PR builds on the anchor layout mechanism introduced in
https://github.com/llvm/llvm-project/pull/169267 and performs the
following refactoring:
1. Introduce getAnchorLayout() and setAnchorLayout() interface for
anchor ops to get and set layout attributes.
2. Add getLocalLayout() and setLocalLayout() utility functions, and
refactor workgroup/subgroup distribution patterns to use these APIs.
These utilities access the layout information directly and locally,
without relying on global propagation.
3. Introduce localPropagateLayoutsFromAnchor(), a utility used by
subgroup distribution to unify non-anchor layout setup.
This function is intended to be invoked upfront by all layout-based
passes (including workgroup/subgroup distribution and unrolling) to
propagate layouts from anchor ops to non-anchor ops.
After this step, patterns within the pass should exclusively use
getLocalLayout() / setLocalLayout().
4. Refactor getDistributeLayoutAttr() and setDistributeLayoutAttr() to
remove special-case handling. These APIs now operate in a uniform order:
anchor ops first, then non-anchor ops, and finally block arguments.
These APIs will be deprecated on long run.
5. Refactor patterns in wg/sg distribution, load optimization passes to
use get/setAnchorLayout() and get/setLocalLayout().
6. Update test cases to enforce that anchor ops must use—and only
use—anchor layouts.
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/optimize-transpose.mlir
mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
mlir/test/Dialect/XeGPU/propagate-layout.mlir
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
mlir/test/Dialect/XeGPU/transform-ops.mlir
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
index efca3cfa0dab7..b10219f71b531 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt
@@ -15,3 +15,8 @@ set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
mlir_tablegen(XeGPUAttrInterface.h.inc -gen-attr-interface-decls)
mlir_tablegen(XeGPUAttrInterface.cpp.inc -gen-attr-interface-defs)
add_mlir_dialect_tablegen_target(MLIRXeGPUAttrInterfaceIncGen)
+
+set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
+mlir_tablegen(XeGPUOpInterface.h.inc -gen-op-interface-decls)
+mlir_tablegen(XeGPUOpInterface.cpp.inc -gen-op-interface-defs)
+add_mlir_dialect_tablegen_target(MLIRXeGPUOpInterfaceIncGen)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 0c059967bb898..7badfaf4a8216 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -34,6 +34,7 @@ class SliceAttr;
#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc>
#include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc>
+#include <mlir/Dialect/XeGPU/IR/XeGPUOpInterface.h.inc>
// clang-format on
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index eae0bd4e68a84..446f64fffa468 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -770,4 +770,30 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> {
}
+def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
+ let cppNamespace = "::mlir::xegpu";
+
+ let description = [{
+ An attribute interface for accessing anchor layout information.
+ This interface provides a method to set and retrieve the anchor layout
+ from operations that implement it.
+ }];
+
+ let methods = [
+ InterfaceMethod<
+ /*desc=*/"Get the anchor layout attribute.",
+ /*retTy=*/"xegpu::DistributeLayoutAttr",
+ /*methodName=*/"getAnchorLayout",
+ /*args=*/(ins)
+ >,
+ InterfaceMethod<
+ /*desc=*/"Set the anchor layout attribute.",
+ /*retTy=*/"void",
+ /*methodName=*/"setAnchorLayout",
+ /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout)
+ >,
+ ];
+
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index b54d620c3c0c3..92ac8870b7068 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -17,6 +17,7 @@ include "mlir/Interfaces/ShapedOpInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
+
// Base class for dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
// * The parent dialect of the operation.
@@ -247,7 +248,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
}];
}
-def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> {
let summary = "prefetches a n-D block to cache";
let description = [{
It issues an instruction to prefetch a block of data from continuous
@@ -296,6 +297,14 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -338,7 +347,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>, AnchorLayoutInterface
]> {
let summary = "loads a n-D block from memory (represented by TensorDesc)"
"to registers (represented by vector)";
@@ -419,6 +428,14 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -462,7 +479,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
}
def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>, AnchorLayoutInterface
]> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
@@ -532,6 +549,14 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
return getTensorDesc().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
SmallVector<OpFoldResult> getMixedOffsets() {
auto statics = getConstOffsets().value_or(SmallVector<int64_t>());
auto dynamics = getOffsets();
@@ -724,7 +749,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
@@ -813,6 +838,14 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
return getSource().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -843,7 +876,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let hasVerifier = 1;
}
-def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayoutInterface]> {
let summary = "load a set of scattered data points from memory.";
let description = [{ It (aka. load) load data per each lane. The output
@@ -949,6 +982,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
return getSource().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getSource());
@@ -1005,7 +1046,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
let hasVerifier = 1;
}
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorLayoutInterface]> {
let summary = "store data to scattered memory locations.";
let description =
[{ It (aka. store) stores data to scattered memory locations. The value is
@@ -1078,7 +1119,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>,
layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
+ : vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex>
```
Example 4 (Lane level):
@@ -1107,6 +1148,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
return getDest().getType();
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
TypedValue<xegpu::TensorDescType> getTensorDesc() {
if (auto tdescType = getTensorDescType()) {
return llvm::cast<TypedValue<xegpu::TensorDescType>>(getDest());
@@ -1217,7 +1266,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
let hasVerifier = 1;
}
-def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
+def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> {
let summary = "It performs mma computation";
let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
@@ -1278,6 +1327,15 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
let results = (outs XeGPU_DpasResType: $result);
let extraClassDeclaration = [{
+
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayoutCd().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutCdAttr(anchorLayout);
+ }
+
VectorType getLhsType() {
return getLhs().getType();
}
@@ -1311,7 +1369,8 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
MemoryEffects<[MemRead, MemWrite]>,
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
- AllShapesMatch<["tensorDesc", "value", "result"]>]> {
+ AllShapesMatch<["tensorDesc", "value", "result"]>,
+ AnchorLayoutInterface]> {
let summary = "Atomic read-modify-write operation on the TensorDesc. ";
let description = [{
@@ -1351,6 +1410,17 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
XeGPU_ValueType:$value,
OptionalAttr<DistributeLayoutAttr>:$layout);
+ let extraClassDeclaration = [{
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
+ }];
+
let results = (outs XeGPU_ValueType:$result);
let assemblyFormat = [{
@@ -1424,7 +1494,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
-def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>, AnchorLayoutInterface]> {
let summary = "Convert the layout of the input operand";
let description = [{
`convert_layout` redistribute data across subgroups and/or lanes from the `input_layout` to
@@ -1458,6 +1528,16 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
+ let extraClassDeclaration = [{
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getTargetLayout();
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setTargetLayoutAttr(anchorLayout);
+ }
+
+ }];
let hasFolder = 1;
let hasVerifier = 1;
@@ -1499,7 +1579,7 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
}
def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
- AllElementTypesMatch<["mem_desc", "res"]>]> {
+ AllElementTypesMatch<["mem_desc", "res"]>, AnchorLayoutInterface]> {
let arguments = (ins XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
@@ -1558,13 +1638,22 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
return vecTy.getShape();
return {};
}
+
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
}];
let hasVerifier = 1;
}
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
- AllElementTypesMatch<["mem_desc", "data"]>]> {
+ AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
let arguments = (ins
AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
XeGPU_MemDesc:$mem_desc,
@@ -1617,6 +1706,14 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
return {};
}
+ xegpu::DistributeLayoutAttr getAnchorLayout() {
+ return getLayout().value_or(nullptr);
+ }
+
+ void setAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout) {
+ setLayoutAttr(anchorLayout);
+ }
+
}];
let hasVerifier = 1;
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 58092c3bb9ed2..46d52516cbc15 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -63,63 +63,6 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
LayoutAttr layout);
-/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
-std::string getLayoutName(const OpOperand &operand);
-
-/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
-std::string getLayoutName(const OpResult result);
-
-/// Retrieves the DistributeLayoutAttr associated with a given Value. For
-/// TensorDescType values, the DistributeLayoutAttr is extracted from the
-/// TensorDescType itself. For other values, it is obtained from the attributes
-/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is
-/// found.
-DistributeLayoutAttr getDistributeLayoutAttr(const Value value);
-
-template <typename AttrTy>
-AttrTy getDistributeLayoutAttrOfType(const Value value) {
- return dyn_cast_if_present<AttrTy>(getDistributeLayoutAttr(value));
-}
-
-/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It
-/// will first check the operand_layout_{id} of the owner operation. If not
-/// found, it will check the operand itself and its defining op.
-DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr);
-
-template <typename AttrTy>
-AttrTy getDistributeLayoutAttrOfType(const OpOperand &opr) {
- return dyn_cast_if_present<AttrTy>(getDistributeLayoutAttr(opr));
-}
-
-/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
-template <typename T,
- typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
- std::is_same_v<T, OpResult>>>
-void removeLayoutAttr(const T &operandOrResult);
-
-/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the
-/// given operation if they exist. If the operation contains regions, it is also
-/// applied recursively to the contained operations
-void removeLayoutAttrs(Operation *op);
-
-/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching
-/// it to the owner's dictionary attributes
-/// If `respectPermLayout` is true the existing permament layout
-/// attribute will be kept and assigned to the attribute dict instead
-/// of the provided layout.
-template <typename T,
- typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
- std::is_same_v<T, OpResult>>>
-void setDistributeLayoutAttr(const T &operandOrResult,
- const DistributeLayoutAttr layout,
- bool respectPermLayout = false);
-
-/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given
-/// operation. If the operation contains regions, it is also applied recursively
-/// to the contained operations
-void setDistributeLayoutAttrs(
- Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl);
-
/// Extract a set of small vectors from a value with a given shape using
/// vector.extract_stride_slice
SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
@@ -175,6 +118,69 @@ template <typename T>
int getLargestDivisor(T dim, ArrayRef<T> candidates,
ArrayRef<T> candidateMultiples = {});
+/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr
+std::string getTemporaryLayoutName(const OpOperand &operand);
+
+/// Return the attribute name for the OpResult to attach DistributeLayoutAttr
+std::string getTemporaryLayoutName(const OpResult result);
+
+/// Retrieves the DistributeLayoutAttr associated with a given Value. For
+/// TensorDescType values, the DistributeLayoutAttr is extracted from the
+/// TensorDescType itself. For other values, it is obtained from the attributes
+/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is
+/// found.
+DistributeLayoutAttr getDistributeLayoutAttr(const Value value);
+
+/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It
+/// will first check the operand_layout_{id} of the owner operation. If not
+/// found, it will check the operand itself and its defining op.
+DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr);
+
+/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists.
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+void removeLayoutAttr(const T &operandOrResult);
+
+/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the
+/// given operation if they exist. If the operation contains regions, it is also
+/// applied recursively to the contained operations
+void removeLayoutAttrs(Operation *op);
+
+/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult
+/// user should use setAnchorLayout instead
+void setDistributeLayoutAttr(const OpResult &Result,
+ const DistributeLayoutAttr layout);
+
+/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpOperand
+/// user should use setAnchorLayout instead
+void setDistributeLayoutAttr(const OpOperand &opr,
+ const DistributeLayoutAttr layout);
+
+/// get and set distribute layout attribute for non-anchor operations
+/// (and offsets/masks of load/store ops before we get rid of their temp attrs)
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+DistributeLayoutAttr getTemporaryLayout(const T &operandOrResult);
+
+template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+void setTemporaryLayout(const T &operandOrResult,
+ const DistributeLayoutAttr layout);
+
+/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and
+/// OpResult of of the given operation. If the operation contains regions, it is
+/// also applied recursively to the contained operations operation.
+/// TODO: To be replaced by recoverTemporaryLayouts()
+void recoverTemporaryLayoutsDeprecated(Operation *op);
+
+/// Attach layout attributes to all vector-type operands of operations within
+/// the given operation's region. Reports an error if any vector operand lacks
+/// a layout attribute.
+bool recoverTemporaryLayouts(Operation *rootOp);
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 1a19ab5fd970b..ccf17da26c942 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -35,6 +35,8 @@ void XeGPUDialect::initialize() {
#include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
>();
}
+#define GET_OP_INTERFACE_CLASSES
+#include "mlir/Dialect/XeGPU/IR/XeGPUOpInterface.cpp.inc"
// A `srcShape` consists of N distribution units, each being `subShapesLayout` x
// `subShape`. A `delinearizedId` is used to identify a particular `subShape`
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index ec5feb8bc8c4a..ba2753f517ce6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -280,11 +280,13 @@ void XeGPUBlockingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
Operation *op = getOperation();
- // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr.
- // This ensures that the LayoutAttr remains accessible even if the defining
- // operation is replaced.
- xegpu::setDistributeLayoutAttrs(
- op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); });
+ // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
+ // replace the function with recoverTemporaryLayouts
+ // if (!xegpu::recoverTemporaryLayouts(op)) {
+ // signalPassFailure();
+ // return;
+ // }
+ xegpu::recoverTemporaryLayoutsDeprecated(op);
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr layout) {
@@ -412,14 +414,14 @@ void XeGPUBlockingPass::runOnOperation() {
op->walk([](Operation *op) {
// Remove the layout attributes cached per operands.
for (OpOperand &opr : op->getOpOperands()) {
- std::string name = xegpu::getLayoutName(opr);
+ std::string name = xegpu::getTemporaryLayoutName(opr);
if (op->hasAttrOfType<xegpu::LayoutAttr>(name))
op->removeAttr(name);
}
// Update the layout attributes per result.
for (OpResult result : op->getOpResults()) {
- std::string name = xegpu::getLayoutName(result);
+ std::string name = xegpu::getTemporaryLayoutName(result);
if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<LoopLikeOpInterface>(op))
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
index ab41fe4298d99..bb80df197d45b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -217,14 +217,14 @@ static Value generateLoads(ConversionPatternRewriter &rewriter,
origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
// Set the layout for the loadOp.
auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
- xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr);
+ loadOp.setAnchorLayout(layoutAttr);
// Insert the loaded block into the right position in data.
auto insertOp = vector::InsertStridedSliceOp::create(
rewriter, loc, loadOp.getResult(), data,
ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
ArrayRef<int64_t>{1, 1});
// InsertOp must have the same layout as newTensorDesc.
- xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr);
+ xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
data = insertOp.getResult();
}
}
@@ -366,8 +366,8 @@ class XeGPULoadNdDescOpPattern final
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
bitcastType, slice);
// BitCastOp must have the same layout as the original loadNdOp.
- xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
- origTensorDescType.getLayoutAttr());
+ xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
+ origTensorDescType.getLayoutAttr());
arraySlices.push_back(bitCastOp.getResult());
}
rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
@@ -384,8 +384,8 @@ class XeGPULoadNdDescOpPattern final
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
loadNdOp.getType(), data);
// BitCastOp must have the same layout as the original loadNdOp.
- xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
- origTensorDescType.getLayoutAttr());
+ xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
+ origTensorDescType.getLayoutAttr());
rewriter.replaceOp(loadNdOp, bitCastOp);
return success();
}
@@ -465,7 +465,8 @@ struct XeGPUOptimizeBlockLoadsPass final
// converted.
target.addDynamicallyLegalOp<vector::ExtractOp>(
[&](vector::ExtractOp extractOp) {
- auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult());
+ auto layout = xegpu::getTemporaryLayout(
+ dyn_cast<OpResult>(extractOp.getResult()));
if (!layout)
return true;
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 112d138c73e18..7fc75e7294ea3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1216,7 +1216,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
}
// If the result is a vector type, add a temporary layout attribute to the
// op.
- xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true);
+ xegpu::setDistributeLayoutAttr(result, layout);
}
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 54a306d63cdcd..a1c0656d0bdb5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -636,16 +636,14 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
- std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
- std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
xegpu::LayoutAttr layoutA =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
xegpu::LayoutAttr layoutB =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
xegpu::LayoutAttr layoutOut =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
+ dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
+
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
@@ -657,6 +655,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
+
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
@@ -685,6 +684,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
FailureOr<VectorType> expectedDistResultTyOrFailure =
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
+
if (failed(expectedDistLhsTyOrFailure) ||
failed(expectedDistRhsTyOrFailure) ||
failed(expectedDistResultTyOrFailure))
@@ -869,11 +869,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp, "Expected at most 2D result at SG level");
std::string layoutPayloadName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(0));
+ xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(0));
std::string layoutOffsetsName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(2));
+ xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(2));
std::string layoutMaskName =
- xegpu::getLayoutName(storeScatterOp->getOpOperand(3));
+ xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3));
xegpu::LayoutAttr layoutPayload =
storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
@@ -1152,9 +1152,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
"Expected 1D offsets and mask vector");
// Assume offset and mask producers will be distributed as well.
std::string layoutOffsetsName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(1));
+ xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(1));
std::string layoutMaskName =
- xegpu::getLayoutName(loadGatherOp->getOpOperand(2));
+ xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(2));
xegpu::LayoutAttr layoutOffsets =
loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
@@ -1224,8 +1224,8 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
// Reduction result should have the same layout as the accumulator.
- xegpu::setDistributeLayoutAttr(cast<OpResult>(reductionResult),
- xegpu::getDistributeLayoutAttr(acc));
+ xegpu::setTemporaryLayout(cast<OpResult>(reductionResult),
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc)));
// For each slice of the source, extract the slice vector, do a reduction
// and, insert the reduced value back to the result vector.
for (int i = 0; i < nSlices; ++i) {
@@ -1240,20 +1240,24 @@ static Value lowerToVectorReductions(TypedValue<VectorType> src,
vector::ExtractStridedSliceOp extractOp =
vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
sliceSizes, {1, 1});
+
int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
+
vector::ShapeCastOp slice = vector::ShapeCastOp::create(
rewriter, loc,
VectorType::get({nSliceElements}, sourceType.getElementType()),
extractOp.getResult());
+
// Shape cast is currently handled in xegpu side. So layouts must be
// retained during lowering. Shape cast output has the same layout as the
// accumulator. Shape cast source has the same layout as the original
// reduction source.
// TODO: other ops generated here may also need layout attributes.
- xegpu::setDistributeLayoutAttr(slice->getOpOperand(0),
- xegpu::getDistributeLayoutAttr(src));
- xegpu::setDistributeLayoutAttr(slice->getOpResult(0),
- xegpu::getDistributeLayoutAttr(acc));
+ auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
+ auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
+
+ xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
+ xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
// Extract and reduction results in scalars, so no result layout is needed.
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
Value reduction = vector::ReductionOp::create(
@@ -1346,7 +1350,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
cast<VectorType>(warpOp.getResult(operandIdx).getType());
VectorType resultType = cast<VectorType>(reductionOp.getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(reductionOp.getSource());
+ xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
FailureOr<VectorType> sourceDistTypeOrFailure =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
@@ -1508,9 +1512,9 @@ struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
dyn_cast<VectorType>(broadcastOp.getResult().getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0));
+ xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(broadcastOp.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));
FailureOr<VectorType> sourceDistType;
Type sourceElemOrDistType;
@@ -1599,9 +1603,9 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
auto resultDistTy =
cast<VectorType>(warpOp.getResult(operandNumber).getType());
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0));
+ xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(shapeCastOp.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(shapeCastOp.getResult()));
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
warpOp,
@@ -1696,8 +1700,7 @@ struct VectorExtractStridedSliceDistribution
int64_t distributedDim = distributedDims[0];
int sourceDistrDimSize =
extractOp.getSourceVectorType().getShape()[distributedDim];
- auto sourceLayout =
- xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0));
+ auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
return rewriter.notifyMatchFailure(
warpOp, "the source of extract_strided_slice op lacks distribution "
@@ -1807,10 +1810,8 @@ struct VectorInsertStridedSliceDistribution
"rank) dims of dest vector");
int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
// Obtain the source and dest layouts.
- auto destLayout =
- xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(1));
- auto sourceLayout =
- xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(0));
+ auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
+ auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
if (!destLayout || !sourceLayout ||
destLayout.getEffectiveLaneLayoutAsInt().empty() ||
sourceLayout.getEffectiveLaneLayoutAsInt().empty())
@@ -1925,7 +1926,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
unsigned operandIdx = operand->getOperandNumber();
VectorType distributedSourceType =
getDistVecTypeBasedOnLaneLayout(
- xegpu::getDistributeLayoutAttr(bitcastOp.getSource()),
+ xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)),
bitcastOp.getSourceVectorType())
.value_or(VectorType());
if (!distributedSourceType)
@@ -1968,9 +1969,9 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
unsigned operandIdx = operand->getOperandNumber();
xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getDistributeLayoutAttr(transposeOp.getVector());
+ xegpu::getTemporaryLayout(transposeOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getDistributeLayoutAttr(transposeOp.getResult());
+ xegpu::getTemporaryLayout(transposeOp->getOpResult(0));
if (!sourceLayout || !resultLayout)
return rewriter.notifyMatchFailure(
transposeOp,
@@ -2053,24 +2054,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
// 1) It is assumed that there are no layout conflicts.
// 2) Any existing layout attributes attached to the operands are ignored.
Operation *op = getOperation();
- op->walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- if (!isa<VectorType>(operand.get().getType()))
- continue;
- if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op))
- continue;
-
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
- if (!layout) {
- op->emitError("Could not find layout attribute for operand ")
- << operand.getOperandNumber() << " of operation " << op->getName();
- signalPassFailure();
- return;
- }
- xegpu::setDistributeLayoutAttr(operand, layout);
- }
- });
+ if (!xegpu::recoverTemporaryLayouts(op)) {
+ signalPassFailure();
+ return;
+ }
+
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
{
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 95e27e46d90ab..07572a4950760 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -425,10 +425,11 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
if (resultTy.getRank() != 2)
return failure();
- auto originalLayout = xegpu::getDistributeLayoutAttr(op.getResult());
- if (!originalLayout)
+ auto layoutCd = op.getLayoutCdAttr();
+ auto layoutA = op.getLayoutAAttr();
+ auto layoutB = op.getLayoutBAttr();
+ if (!layoutCd || !layoutA || !layoutB)
return failure();
-
size_t i = 0;
SmallVector<Value> newDpasOps;
for (auto aVec : adaptor.getLhs()) {
@@ -447,11 +448,12 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
llvm::cast<VectorType>(bVec.getType()).getShape();
VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]},
resultTy.getElementType());
- tmpC = xegpu::DpasOp::create(rewriter, loc, resTy, operands);
- xegpu::setDistributeLayoutAttr(cast<OpResult>(tmpC),
- originalLayout.dropSgLayoutAndData());
+ auto newDpasOp = xegpu::DpasOp::create(rewriter, loc, resTy, operands);
+ newDpasOp.setLayoutCdAttr(layoutCd.dropSgLayoutAndData());
+ newDpasOp.setLayoutAAttr(layoutA.dropSgLayoutAndData());
+ newDpasOp.setLayoutBAttr(layoutB.dropSgLayoutAndData());
- newDpasOps.push_back(tmpC);
+ newDpasOps.push_back(newDpasOp);
}
}
rewriter.replaceOpWithMultiple(op, {newDpasOps});
@@ -491,7 +493,7 @@ struct WgToSgVectorBroadcastOp
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(llvm::cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -506,8 +508,8 @@ struct WgToSgVectorBroadcastOp
for (auto operand : adaptor.getOperands().front()) {
auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(),
newResultType, operand);
- xegpu::setDistributeLayoutAttr(newBroadcast->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(newBroadcast->getResult(0),
+ layout.dropSgLayoutAndData());
newBroadcastOps.push_back(newBroadcast.getResult());
}
@@ -534,7 +536,7 @@ struct WgToSgElementwiseOp : public ConversionPattern {
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op->getResult(0));
+ xegpu::getTemporaryLayout(llvm::cast<OpResult>(op->getResult(0)));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -740,7 +742,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
return failure();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -754,8 +756,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
auto eltType = vecType.getElementType();
auto setLayout = [&](Value val) {
- xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(val),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(llvm::dyn_cast<OpResult>(val),
+ layout.dropSgLayoutAndData());
};
if (vecAttr.isSplat()) {
@@ -901,8 +903,8 @@ struct WgToSgLoadGatherOpWithOffset
return failure();
ArrayRef<int64_t> wgShape = resultType.getShape();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -930,7 +932,7 @@ struct WgToSgLoadGatherOpWithOffset
rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
newLayout);
- xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout);
+ newLoadOp.setAnchorLayout(newLayout);
newLoadOps.push_back(newLoadOp);
}
rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -955,8 +957,8 @@ struct WgToSgStoreScatterOpWithOffset
if (!valueType)
return failure();
- xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getOperand(0));
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -985,7 +987,7 @@ struct WgToSgStoreScatterOpWithOffset
// Skip for operand one (memref)
if (operand.getOperandNumber() == 1)
continue;
- xegpu::setDistributeLayoutAttr(operand, layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(operand, layout.dropSgLayoutAndData());
}
}
rewriter.eraseOp(op);
@@ -1050,7 +1052,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {
matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1078,12 +1080,12 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {
vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]);
auto finalSteps =
arith::AddIOp::create(rewriter, loc, steps, bcastOffset);
- xegpu::setDistributeLayoutAttr(steps->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setDistributeLayoutAttr(bcastOffset->getResult(0),
- layout.dropSgLayoutAndData());
- xegpu::setDistributeLayoutAttr(finalSteps->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(steps->getResult(0),
+ layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(bcastOffset->getResult(0),
+ layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(finalSteps->getResult(0),
+ layout.dropSgLayoutAndData());
newOps.push_back(finalSteps);
}
@@ -1107,7 +1109,7 @@ struct WgToSgVectorShapeCastOp
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1172,8 +1174,8 @@ struct WgToSgVectorShapeCastOp
for (auto src : adaptor.getSource()) {
auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(),
newResultType, src);
- xegpu::setDistributeLayoutAttr(newShapeCast->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(newShapeCast->getResult(0),
+ layout.dropSgLayoutAndData());
newShapeCastOps.push_back(newShapeCast.getResult());
}
@@ -1201,7 +1203,7 @@ struct WgToSgMultiDimReductionOp
auto srcShape = srcType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1234,8 +1236,8 @@ struct WgToSgMultiDimReductionOp
auto newOp = vector::MultiDimReductionOp::create(
rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc,
adaptor.getAcc()[0], op.getReductionDims());
- xegpu::setDistributeLayoutAttr(newOp->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(newOp->getResult(0),
+ layout.dropSgLayoutAndData());
newReductions.push_back(newOp.getResult());
}
@@ -1256,10 +1258,10 @@ struct WgToSgVectorTransposeOp
ArrayRef<int64_t> wgShape = resultType.getShape();
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
-
+ // TODO-LayoutRefactor: handle the case using getTemporaryLayout
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getDistributeLayoutAttr(op.getVector());
if (!sourceLayout || !sourceLayout.isForWorkgroup())
@@ -1298,8 +1300,8 @@ struct WgToSgVectorTransposeOp
for (auto src : adaptor.getVector()) {
auto newTranspose = vector::TransposeOp::create(
rewriter, op.getLoc(), newResultType, src, permutation);
- xegpu::setDistributeLayoutAttr(newTranspose->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(newTranspose->getResult(0),
+ layout.dropSgLayoutAndData());
newTransposeOps.push_back(newTranspose.getResult());
}
@@ -1318,7 +1320,7 @@ struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
typename OpConversionPattern<MaskOpType>::OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op.getResult());
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
return failure();
@@ -1368,8 +1370,8 @@ struct WgToSgVectorMaskOp : public OpConversionPattern<MaskOpType> {
auto newCreateMaskOp =
vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands);
- xegpu::setDistributeLayoutAttr(newCreateMaskOp->getResult(0),
- layout.dropSgLayoutAndData());
+ xegpu::setTemporaryLayout(newCreateMaskOp->getResult(0),
+ layout.dropSgLayoutAndData());
newCreateMaskOps.push_back(newCreateMaskOp.getResult());
}
@@ -1409,6 +1411,14 @@ struct XeGPUWgToSgDistributePass
} // namespace
void XeGPUWgToSgDistributePass::runOnOperation() {
+
+ // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
+ // Operation *op = getOperation();
+ // if (!xegpu::recoverTemporaryLayouts(op)) {
+ // signalPassFailure();
+ // return;
+ // }
+
// Track existing UnrealizedConversionCastOps
SmallVector<Operation *> existingCastOps;
getOperation()->walk([&](UnrealizedConversionCastOp castOp) {
@@ -1499,7 +1509,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
});
target.addDynamicallyLegalOp<xegpu::DpasOp>([=](xegpu::DpasOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout = op.getLayoutCdAttr();
return isLegal(layout);
});
@@ -1519,7 +1529,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
if (!vecType)
return true;
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout =
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
return isLegal(layout);
});
@@ -1529,19 +1540,20 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
vector::ConstantMaskOp, vector::CreateMaskOp>(
[=](Operation *op) -> bool {
// Check for either a SliceAttr or LayoutAttr on the result.
- auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+ auto layout =
+ xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
return isLegal(layout);
});
target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
[=](xegpu::LoadGatherOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getResult());
+ auto layout = op.getLayoutAttr();
return isLegal(layout);
});
target.addDynamicallyLegalOp<xegpu::StoreScatterOp>(
[=](xegpu::StoreScatterOp op) -> bool {
- auto layout = xegpu::getDistributeLayoutAttr(op.getOperand(0));
+ auto layout = op.getLayoutAttr();
return isLegal(layout);
});
@@ -1571,7 +1583,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
}
xegpu::DistributeLayoutAttr layout =
- xegpu::getDistributeLayoutAttr(op->getResult(0));
+ xegpu::getTemporaryLayout(op->getResult(0));
return isLegal(layout);
});
@@ -1596,7 +1608,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
// Layout propagation pass will activated.
getOperation()->walk([](Operation *op) {
for (OpResult result : op->getOpResults()) {
- std::string name = xegpu::getLayoutName(result);
+ std::string name = xegpu::getTemporaryLayoutName(result);
if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op)) {
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 9f126fe8c2415..d3906e37ffbf1 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -101,13 +101,13 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
return xegpu::getDistributedVectorType(helperTdescTy);
}
-std::string xegpu::getLayoutName(const OpOperand &operand) {
+std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
const StringRef prefix("layout_operand_");
unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
return llvm::formatv("{0}{1}", prefix, idx).str();
}
-std::string xegpu::getLayoutName(const OpResult result) {
+std::string xegpu::getTemporaryLayoutName(const OpResult result) {
const StringRef prefix = "layout_result_";
return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}
@@ -124,29 +124,17 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
- if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
- return convertOp.getTargetLayoutAttr();
-
- // for LoadNdOp, the layout is stored in the tensor descriptor
- if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
- return getDistributeLayoutAttr(loadNd.getTensorDesc());
-
- // for LoadMatrixOp, the layout is attached to the property of the op
- if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return loadOp.getLayoutAttr();
-
- // for StoreMatrixOp, the layout is attached to the property of the op
- if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return storeOp.getLayoutAttr();
- std::string layoutName = getLayoutName(result);
- if (defOp->hasAttr(layoutName))
- return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
-
- // check for "permament" layout only after "temporary" layout name lookup
- // for backward compatibility
- if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
- return loadGatherOp.getLayoutAttr();
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
+ auto layout = anchorOp.getAnchorLayout();
+ return layout;
+ }
+
+ std::string layoutName = getTemporaryLayoutName(result);
+ if (defOp->hasAttr(layoutName)) {
+ auto layout =
+ defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ return layout;
+ }
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -160,27 +148,46 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
return nullptr;
}
-
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
+ unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
+
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
+ if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
+ if (idx == 0) {
+ return dpasOp.getLayoutAAttr();
+ } else if (idx == 1) {
+ return dpasOp.getLayoutBAttr();
+ } else if (idx == 2) {
+ return dpasOp.getLayoutCdAttr();
+ }
+ }
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
+ return convertOp.getInputLayoutAttr();
+ }
+ auto layout = anchorOp.getAnchorLayout();
- if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return loadOp.getLayoutAttr();
-
- if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return storeOp.getLayoutAttr();
-
- std::string layoutName = xegpu::getLayoutName(opr);
- if (op->hasAttr(layoutName))
- return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ if (idx == 0)
+ return layout;
- // check for "permament" layout only after "temporary" layout name lookup
- if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
- if (auto layout = storeScatterOp.getLayoutAttr())
+ // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
+ // the layout is valid for the first two operands: value and memref/tdesc.
+ // For other operations, the layout applies to the first operand only.
+ if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
+ op) &&
+ (idx < 2))
return layout;
+ }
- return getDistributeLayoutAttr(opr.get());
+ std::string layoutName = xegpu::getTemporaryLayoutName(opr);
+ if (op->hasAttr(layoutName)) {
+ auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ return layout;
+ }
+
+ auto layout = getDistributeLayoutAttr(opr.get());
+ return layout;
}
// Returns the permanent layout attribute for the given result if it's
@@ -218,55 +225,157 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
return candidate;
}
-template <typename T, typename>
-void xegpu::setDistributeLayoutAttr(const T &operandOrResult,
- const DistributeLayoutAttr layout,
- bool respectPermLayout) {
- Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getLayoutName(operandOrResult);
+// TODO-LayoutRefactor: Remove this function after replacing use
+// with setTemporaryLayout or setAnchorLayout
+void xegpu::setDistributeLayoutAttr(
+ const mlir::OpResult &result,
+ const mlir::xegpu::DistributeLayoutAttr layout) {
+ Operation *owner = result.getOwner();
- if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
+ if (anchorOp.getAnchorLayout() == layout)
+ return;
+ anchorOp.setAnchorLayout(layout);
return;
+ }
- DistributeLayoutAttr candidate = layout;
- if (respectPermLayout)
- candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name);
+ std::string name = xegpu::getTemporaryLayoutName(result);
+ if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
+ return;
+ }
+ if (layout) {
+ owner->setAttr(name, layout);
+ }
+}
- if (candidate)
- owner->setAttr(name, candidate);
+// TODO-LayoutRefactor: Remove this function after replacing use
+// with setTemporaryLayout or setAnchorLayout
+void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
+ const DistributeLayoutAttr layout) {
+ Operation *owner = operand.getOwner();
+ unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
+
+ if (!layout) {
+ return;
+ }
+ if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
+ if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
+ if (idx == 0) {
+ return dpasOp.setLayoutAAttr(layout);
+ } else if (idx == 1) {
+ return dpasOp.setLayoutBAttr(layout);
+ } else if (idx == 2) {
+ return dpasOp.setLayoutCdAttr(layout);
+ }
+ }
+ if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
+ return convertOp.setInputLayoutAttr(layout);
+ }
+
+ // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
+ // the layout is valid for the first two operands: value and memref/tdesc.
+ // For other operations, the layout applies to the first operand only.
+ if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
+ owner)) {
+ if (idx < 2) {
+ anchorOp.setAnchorLayout(layout);
+ }
+ } else {
+ if (idx == 0) {
+ anchorOp.setAnchorLayout(layout);
+ }
+ }
+ }
+
+ std::string name = xegpu::getTemporaryLayoutName(operand);
+ if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
+ return;
+ }
+ if (layout) {
+ owner->setAttr(name, layout);
+ }
}
-// Explicit instantiation for OpResult
-template void xegpu::setDistributeLayoutAttr<mlir::OpResult>(
+template <typename T, typename>
+xegpu::DistributeLayoutAttr
+xegpu::getTemporaryLayout(const T &operandOrResult) {
+ Operation *op = operandOrResult.getOwner();
+
+ std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
+ if (op->hasAttr(layoutName)) {
+ auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ return layout;
+ }
+
+ return nullptr;
+}
+
+template xegpu::DistributeLayoutAttr
+xegpu::getTemporaryLayout<mlir::OpResult>(const OpResult &result);
+template xegpu::DistributeLayoutAttr
+xegpu::getTemporaryLayout<mlir::OpOperand>(const OpOperand &operand);
+
+template <typename T, typename>
+void xegpu::setTemporaryLayout(const T &operandOrResult,
+ const xegpu::DistributeLayoutAttr layout) {
+ Operation *owner = operandOrResult.getOwner();
+ std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
+ if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
+ return;
+ }
+ if (layout) {
+ owner->setAttr(name, layout);
+ }
+}
+
+template void xegpu::setTemporaryLayout<mlir::OpResult>(
const mlir::OpResult &result,
- const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout);
+ const mlir::xegpu::DistributeLayoutAttr layout);
-// Explicit instantiation for OpOperand
-template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>(
+template void xegpu::setTemporaryLayout<mlir::OpOperand>(
const mlir::OpOperand &operand,
- const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout);
+ const mlir::xegpu::DistributeLayoutAttr layout);
-void xegpu::setDistributeLayoutAttrs(
- Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) {
+void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
op->walk([&](Operation *nestOp) {
- if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
- return;
-
for (OpOperand &opr : nestOp->getOpOperands()) {
- auto layout = getLayoutImpl(opr.get());
+ auto layout = getDistributeLayoutAttr(opr.get());
setDistributeLayoutAttr(opr, layout);
}
+
for (OpResult result : nestOp->getOpResults()) {
- auto layout = getLayoutImpl(result);
+ auto layout = getDistributeLayoutAttr(result);
setDistributeLayoutAttr(result, layout);
}
});
}
+/// Attach layout attributes to all vector-type operands of operations within
+/// the given operation's region. Reports an error if any vector operand lacks
+/// a layout attribute.
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+ auto result = rootOp->walk([&](Operation *op) {
+ for (OpOperand &operand : op->getOpOperands()) {
+ // Layouts are needed for vector type only.
+ if (!isa<VectorType>(operand.get().getType()))
+ continue;
+ auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+ if (!layout) {
+ op->emitError("Could not find layout attribute for operand ")
+ << operand.getOperandNumber() << " of operation " << op->getName();
+ return WalkResult::interrupt();
+ }
+ xegpu::setDistributeLayoutAttr(operand, layout);
+ }
+ return WalkResult::advance();
+ });
+ return !result.wasInterrupted();
+}
+
template <typename T, typename>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
Operation *owner = operandOrResult.getOwner();
- std::string name = xegpu::getLayoutName(operandOrResult);
+ std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
if (owner->hasAttrOfType<DistributeLayoutAttr>(name))
owner->removeAttr(name);
}
@@ -285,6 +394,14 @@ void xegpu::removeLayoutAttrs(Operation *op) {
removeLayoutAttr(opr);
for (OpResult result : nestOp->getOpResults())
removeLayoutAttr(result);
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout"))
+ op->removeAttr("layout");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_a"))
+ op->removeAttr("layout_a");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_b"))
+ op->removeAttr("layout_b");
+ if (op->hasAttrOfType<DistributeLayoutAttr>("layout_cd"))
+ op->removeAttr("layout_cd");
});
}
diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
index 24a0de6ed48a5..c748c1ca5ef88 100644
--- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
+++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
@@ -10,7 +10,7 @@
// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16>
@@ -38,7 +38,7 @@ gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK-SAME: {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8>
@@ -73,7 +73,7 @@ gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8
// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index
// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
+// CHECK-SAME: <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16>
@@ -115,8 +115,8 @@ gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16
// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) {
// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout<
-// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] <{layout = #xegpu.layout<
+// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16>
@@ -159,13 +159,13 @@ gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %ar
// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]}
// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32>
// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]]
// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]}
@@ -225,12 +225,12 @@ gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2
// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index
-// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16>
// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index
-// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32>
// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16>
@@ -244,12 +244,12 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
%c32 = arith.constant 32 : index
%c256 = arith.constant 256 : index
%0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a>
- %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
+ %1 = xegpu.load_nd %0[%c0, %c0] { layout = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32>
%3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16>
-> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
%4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1)
-> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) {
- %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b }
+ %6 = xegpu.load_nd %3[%c0, %arg3] { layout = #b }
: !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16>
%19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16>
%20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16>
@@ -265,10 +265,10 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
%12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
%13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
%14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16>
- %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %15 = xegpu.dpas %arg0, %11, %arg4 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %16 = xegpu.dpas %arg0, %12, %arg5 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %17 = xegpu.dpas %arg0, %13, %arg6 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %18 = xegpu.dpas %arg0, %14, %arg7 {layout = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>
} {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a}
xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 32fb3178a8af2..5f70831f45e97 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -8,7 +8,7 @@
// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
-// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout<inst_data = [8, 16]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
gpu.module @test {
@@ -32,11 +32,11 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<inst_data = [16, 16]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<inst_data = [8, 16]>, layout_b = #xegpu.layout<inst_data = [16, 16]>, layout_cd = #xegpu.layout<inst_data = [8, 16]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -73,7 +73,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [8, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
@@ -112,7 +112,7 @@ gpu.module @test_kernel {
%out:3 = scf.for %k = %c0 to %c1024 step %c32
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
- //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+ //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout<inst_data = [4, 16]>}> :
//CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
%a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
%b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
@@ -141,7 +141,7 @@ gpu.module @test {
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64, layout = #xegpu.layout<inst_data = [16, 8]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 48e77d867508b..b88d8e1a78a26 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -6,11 +6,11 @@ gpu.module @test {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -32,7 +32,7 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
gpu.module @test {
// CHECK-LABEL: func.func @dpas_i8(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
-// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
@@ -47,7 +47,7 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
gpu.module @test {
// CHECK-LABEL: func.func @load_with_transpose_effect(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -109,7 +109,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
@@ -136,7 +136,7 @@ gpu.module @test {
// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -185,7 +185,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
@@ -204,7 +204,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
func.func @scatter_ops(%src: memref<256xf16>) {
%1 = arith.constant dense<1>: vector<16xi1>
@@ -221,7 +221,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
@@ -241,7 +241,7 @@ gpu.module @test {
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} dense<12> : vector<16xindex>
// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16>
// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
@@ -257,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) {
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16>
// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
@@ -282,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
@@ -303,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}>
// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32>
@@ -340,9 +340,9 @@ gpu.module @test {
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16>
func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
@@ -363,7 +363,7 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
@@ -386,11 +386,11 @@ gpu.module @test {
// CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) ->
// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -426,11 +426,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
@@ -456,11 +456,11 @@ gpu.module @test {
// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
// CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) {
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16>
// CHECK-NEXT: } else {
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> :
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16>
// CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -600,7 +600,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
@@ -622,7 +622,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1]
@@ -645,7 +645,7 @@ gpu.module @test {
// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 7819a438057c4..b136c89925682 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -140,10 +140,9 @@ gpu.func @dpas(%laneid: index) {
%2 = "some_op"() : () -> vector<8x16xf32>
%3 = xegpu.dpas %0, %1, %2
{
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
gpu.yield %3 : vector<8x16xf32>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index e5e3d2a1c1ad5..87c67ba6bf324 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -20,18 +20,20 @@ gpu.module @xevm_module{
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.load_nd %0[%c0, %c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
%3 = xegpu.load_nd %2[%c0, %c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<16x16xf16>
%4 = xegpu.dpas %1, %3
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%5 = math.exp %4
@@ -84,7 +86,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
%2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%3 = xegpu.load_nd %2[%0, %1]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
%4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
@@ -95,14 +97,16 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
-> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
%7 = xegpu.load_nd %5[%0, %arg3]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
%8 = xegpu.load_nd %6[%arg3, %1]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
%9 = xegpu.dpas %7, %8, %arg4
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %9 : vector<8x16xf32>
@@ -137,7 +141,7 @@ gpu.module @xevm_module{
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
%loaded = scf.if %pred -> (vector<16x8xf16>) {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
scf.yield %3 : vector<16x8xf16>
} else {
@@ -169,7 +173,7 @@ gpu.module @xevm_module{
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
scf.if %pred {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
@@ -194,17 +198,20 @@ gpu.module @xevm_module{
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
-> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
%4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
: vector<16x8xi32> to vector<16x16xf16>
%5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: vector<16x16xf16> to vector<16x16xf16>
- %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %6 = xegpu.dpas %1, %5
+ {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
%7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
-> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -341,7 +348,7 @@ gpu.module @xevm_module{
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
%1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
// CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
%2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
@@ -356,7 +363,7 @@ gpu.module @xevm_module{
gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
%c0 = arith.constant 0 : index
%mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
- %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
+ %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
%11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
%2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir
index 561034fb5880b..13ed24ebf0a3a 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir
@@ -149,7 +149,7 @@ func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>,
%4 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16>
%5 = xegpu.load_nd %4[0, 0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16>
// CHECK: = xegpu.dpas
- // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+ // CHECK-SAME: {layout_cd = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
%6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16>
return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d61908b422194..9f9edcd416ddf 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -15,7 +15,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
@@ -23,11 +23,11 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
@@ -36,7 +36,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -56,7 +56,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
@@ -64,11 +64,11 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
@@ -77,7 +77,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
gpu.return
}
}
@@ -100,7 +100,7 @@ gpu.module @test_kernel {
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2>
@@ -108,10 +108,10 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) {
//CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
- %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+ %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
//CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1>
//CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16>
@@ -120,7 +120,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>
}
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %out#2, %c_tdesc: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
gpu.return
}
}
@@ -141,7 +141,7 @@ gpu.module @test_kernel {
%n = arith.muli %block_id_y, %c32 : index
%c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
- %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+ %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
%a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
@@ -149,13 +149,13 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
-> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
%e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
- //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}}
+ %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
//CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
@@ -164,7 +164,7 @@ gpu.module @test_kernel {
: !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
}
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
- xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
}
}
@@ -188,14 +188,14 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
+ xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l>
@@ -227,14 +227,14 @@ gpu.module @test_kernel {
iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
-> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) {
//CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16>
- %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
- %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
+ %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
%c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
- xegpu.store_nd %c, %arg2: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
+ xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
//CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16>
%a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l>
@@ -257,12 +257,12 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
%0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32>
%3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- xegpu.store_nd %2, %3: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
+ xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
gpu.return
}
}
@@ -282,14 +282,14 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c32 : index
%n = arith.muli %block_id_y, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
%2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32>
%3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- xegpu.store_nd %2, %3: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
+ xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
gpu.return
}
}
@@ -304,11 +304,11 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
%0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
+ %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
// CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
%2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
%3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
- xegpu.store_nd %2, %3: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
+ xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
gpu.return
}
}
@@ -323,7 +323,7 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
+ %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
%11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
// CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
%2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
@@ -343,24 +343,25 @@ gpu.module @test_kernel {
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
- %1 = xegpu.load_nd %0: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
+ %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
// CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32>
%2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
%3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
- xegpu.store_nd %2, %3: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
+ xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
gpu.return
}
}
// -----
+
gpu.module @test_kernel {
// CHECK-LABEL: test_prefetch_load_store_update
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ // CHECK-COUNT-2: xegpu.prefetch {{.*}}
// CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
- // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
- // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+ // CHECK-COUNT-2: xegpu.load {{.*}}
+ // CHECK-COUNT-2: xegpu.store {{.*}}
gpu.func @test_prefetch_load_store_update(%src: ui64) {
@@ -372,7 +373,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+ xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
%delta = arith.constant dense<[
32, 32, 32, 32, 32, 32, 32, 32,
@@ -386,10 +387,10 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
%st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
- xegpu.store %st_vec, %tdesc, %mask:
+ xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout<inst_data = [16]>}:
vector<32xf32>,
!xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>,
vector<32xi1>
@@ -400,15 +401,14 @@ gpu.module @test_kernel {
}
// -----
-
gpu.module @test_kernel {
// CHECK-LABEL: test_prefetch_load_store_update_chunk
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ // CHECK-COUNT-4: xegpu.prefetch {{.*}} <{layout = #xegpu.layout<inst_data = [16, 2]>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
// CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
- // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
- // CHECK-COUNT-4: xegpu.store {{.*}} : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
+ // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
+ // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) {
@@ -420,7 +420,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+ xegpu.prefetch %tdesc {layout = #xegpu.layout<inst_data = [16, 2]>}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
%delta = arith.constant dense<[
32, 32, 32, 32, 32, 32, 32, 32,
@@ -434,10 +434,10 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<32x4xf32>
%st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>:
+ xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #xegpu.layout<inst_data = [16, 2]>}>:
vector<32x4xf32>,
!xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>,
vector<32xi1>
@@ -476,7 +476,7 @@ gpu.module @test_kernel {
]> : vector<4x8xindex>
%tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
- xegpu.prefetch %tdesc: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
+ xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>
%delta = arith.constant dense<[
[32, 32, 32, 32, 32, 32, 32, 32],
@@ -490,10 +490,10 @@ gpu.module @test_kernel {
%c4 = arith.constant 4: index
%mask = vector.create_mask %c4, %c4: vector<4x8xi1>
- %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
+ %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>, vector<4x8xi1> -> vector<4x8x4xf32>
%st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32>
- xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>:
+ xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, layout = #l}>:
vector<4x8x4xf32>,
!xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #l>,
vector<4x8xi1>
@@ -513,13 +513,13 @@ gpu.module @test_kernel {
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
//CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
- //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
- //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]]
+ //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]]
//CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
//CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
//CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
@@ -529,12 +529,12 @@ gpu.module @test_kernel {
%c0 = arith.constant 0 : index
%a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
%b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b>
- %a = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
- %b = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
+ %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%e = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
- %c = xegpu.dpas %e, %b {layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %c = xegpu.dpas %e, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
- xegpu.store_nd %c, %c_tdesc: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
+ xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
}
}
@@ -599,7 +599,7 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
}
@@ -621,10 +621,7 @@ gpu.module @test_kernel {
%mask = vector.create_mask %c17: vector<32xi1>
%st_vec = arith.constant dense<1023.0>: vector<32xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<inst_data = [16]>,
- layout_operand_2 = #xegpu.layout<inst_data = [16]>,
- layout_operand_3 = #xegpu.layout<inst_data = [16]>,
- l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
+ xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
@@ -649,7 +646,7 @@ gpu.module @test_kernel {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
}
@@ -675,10 +672,7 @@ gpu.module @test_kernel {
%mask = vector.create_mask %c17: vector<32xi1>
%st_vec = arith.constant dense<1023.>: vector<32x4xf32>
- xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout_operand_0 = #xegpu.layout<inst_data = [16, 2]>,
- layout_operand_2 = #xegpu.layout<inst_data = [16, 2]>,
- layout_operand_3 = #xegpu.layout<inst_data = [16, 2]>,
- l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
+ xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
}
@@ -704,7 +698,7 @@ gpu.module @test_kernel {
]]> : vector<1x1x32xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
gpu.return %ld : vector<1x1x32xf32>
}
@@ -752,11 +746,11 @@ gpu.module @test_kernel {
%b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
- %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
- %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+ %a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+ %b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
%result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
- xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
+ xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
gpu.return
}
}
@@ -778,10 +772,10 @@ gpu.module @test_kernel {
128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]]
]> : vector<1x1x32xindex>
%mask = arith.constant {layout_result_0 = #inst_data} dense<true> : vector<1x1x32xi1>
- %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
- %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout_result_0 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ %b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
%addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32>
- xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout_operand_0 = #inst_data, layout_operand_2 = #inst_data, layout_operand_3 = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
+ xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index dbc52b8a98894..c3be138fef38a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -226,7 +226,7 @@ gpu.module @test {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
}
@@ -381,7 +381,7 @@ gpu.module @test {
%c17 = arith.constant 17: index
%mask = vector.create_mask %c17: vector<32xi1>
- %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+ %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 9580769d37313..6e9711442b92d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -6,7 +6,7 @@ gpu.module @test_elementwise_ops {
gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} : vector<12x8xf32>
@@ -24,7 +24,7 @@ gpu.module @test_elementwise_ops {
gpu.func @unary_ops(%a: memref<24x32xf32>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
@@ -44,10 +44,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -71,13 +71,13 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1>
-> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c
+ %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi1>
// CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -99,10 +99,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -128,16 +128,16 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
%tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32>
-> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_c = xegpu.load_nd %tdesc_c
+ %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
- %load_d = xegpu.load_nd %tdesc_d
+ %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
-> vector<24x32xi32>
// CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
@@ -160,10 +160,10 @@ gpu.module @test_elementwise_ops {
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
-> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
- %load_b = xegpu.load_nd %tdesc_b
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
-> vector<24x32xf32>
// CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 4829af3612de3..6b8b4f282b744 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -41,7 +41,7 @@ gpu.module @test_round_robin_assignment {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
// CHECK-NOT: xegpu.load_nd
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -54,8 +54,7 @@ gpu.module @test_round_robin_assignment {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NOT: xegpu.store_nd
- %load = xegpu.load_nd %tdesc
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc
: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -79,20 +78,20 @@ gpu.module @test_round_robin_assignment {
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16>
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
- %dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ %dpas = xegpu.dpas %load_a, %load_b
+ {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
@@ -114,7 +113,7 @@ gpu.module @test_round_robin_assignment {
gpu.func @broadcast(%src: memref<128x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32>
-> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc
+ %load = xegpu.load_nd %tdesc {layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<128x1xf32>
// CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32>
@@ -137,7 +136,7 @@ gpu.module @test_round_robin_assignment {
// CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
%2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1)
-> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
- %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %3 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
@@ -153,7 +152,7 @@ gpu.module @test_round_robin_assignment {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
%2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
// CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
%3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
@@ -166,7 +165,7 @@ gpu.module @test_round_robin_assignment {
xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
%4 = arith.addi %arg3, %c1_i32 : i32
%5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -181,12 +180,12 @@ gpu.module @test_round_robin_assignment {
// CHECK-LABEL: scf.if
// CHECK-SAME: (vector<16xf32>, vector<16xf32>)
%4 = scf.if %3 -> (vector<256xf32>) {
- %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
} else {
- %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
@@ -200,7 +199,7 @@ gpu.module @test_round_robin_assignment {
%id = gpu.subgroup_id : index
%t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %d = xegpu.load_nd %t {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}: !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
%0 = arith.cmpi eq, %id, %c10 : index
// CHECK-LABEL: scf.if
@@ -224,7 +223,7 @@ gpu.module @test_round_robin_assignment {
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
// CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
// CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
gpu.return
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index 37a76f316e75a..ad346307437e4 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -14,11 +14,11 @@ gpu.module @test_distribution {
// CHECK-LABEL: load_nd_tdesc_with_offset
gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
// CHECK-NOT: xegpu.load_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -30,7 +30,7 @@ gpu.module @test_distribution {
// CHECK-NOT: xegpu.store_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0]
@@ -53,23 +53,25 @@ gpu.module @test_distribution {
// CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>)
gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) {
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
// CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
- // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16>
-> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16>
-> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x256xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}
@@ -80,7 +82,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>}
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32>
@@ -119,7 +121,7 @@ gpu.module @test_distribution {
gpu.func @vector_transpose(%src: memref<256x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x128xf32>
// CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 69eb8ce9dfba5..da6ad976d3730 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -38,10 +38,10 @@ gpu.module @test_distribution {
//CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index
//CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
//CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index
- //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
+ //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
gpu.return
@@ -53,7 +53,7 @@ gpu.module @test_distribution {
//CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0]
@@ -75,42 +75,49 @@ gpu.module @test_distribution {
// CHECK-LABEL: dpas
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
// CHECK-LABEL: dpas_no_sg_data
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
- %load_a = xegpu.load_nd %tdesc_a[0, 0]
+ %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ order = [1, 0]>}
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
- %load_b = xegpu.load_nd %tdesc_b[0, 0]
+ %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ order = [1, 0]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+ order = [1, 0]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -119,7 +126,9 @@ gpu.module @test_distribution {
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
// CHECK-NOT: vector<32x32xf32>
%dpas = xegpu.dpas %a, %b
- {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
gpu.return
}
@@ -129,7 +138,7 @@ gpu.module @test_distribution {
gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32>
-> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
-> vector<256x1xf32>
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>}
@@ -145,7 +154,7 @@ gpu.module @test_distribution {
gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32>
-> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<1x128xf32>
// CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -175,9 +184,9 @@ gpu.module @test_distribution {
%3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
// load_nd with offset
- %5 = xegpu.load_nd %2[%0, %1] : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
- %6 = xegpu.load_nd %3[%0, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %7 = xegpu.load_nd %4[%c0, %1] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+ %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
// scf.for loop
// CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
// CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
@@ -189,10 +198,13 @@ gpu.module @test_distribution {
%8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5)
-> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
// load_nd with offset inside loop
- %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
- : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
- %10 = xegpu.load_nd %3[%arg3, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %11 = xegpu.load_nd %4[%c0, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %9 = xegpu.dpas %arg4, %arg5, %arg6
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+ %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
}
// store_nd with offset
@@ -215,7 +227,7 @@ gpu.module @test_distribution {
// CHECK-NOT: index.sub
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<256x128xf32>
} {sg_id_range = #xegpu.range<[0, 32]>}
@@ -228,7 +240,7 @@ gpu.module @test_distribution {
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
%tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32>
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
@@ -244,7 +256,7 @@ gpu.module @test_distribution {
%c32 = arith.constant 32 : index
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<256x128xf32>
%cond1 = arith.cmpi sge, %sg_id, %c3 : index
@@ -257,7 +269,7 @@ gpu.module @test_distribution {
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
%td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32>
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
- %ld = xegpu.load_nd %td[0, 0]
+ %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
@@ -275,7 +287,7 @@ gpu.module @test_distribution {
// CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
- %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16>
gpu.return
}
@@ -287,13 +299,11 @@ gpu.module @test_distribution {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
// CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
- // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
- // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
- // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
+ // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
%val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1>
- xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
+ xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
l1_hint = #xegpu.cache_hint<cached>}
@@ -310,7 +320,7 @@ gpu.module @test_distribution {
// CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
- %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256xindex>, vector<256xi1> -> vector<256x4xf16>
gpu.return
}
@@ -370,7 +380,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32>
-> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>}
: !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
-> vector<4x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
@@ -384,7 +394,7 @@ gpu.module @test_distribution {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>}
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
@@ -398,7 +408,7 @@ gpu.module @test_distribution {
%cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>
%offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0> : vector<4x2x6x32xindex>
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>
- %load = xegpu.load %src[%offset], %mask {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
+ %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
%reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
: vector<4x2x6x32xf16> to vector<4x2x6xf16>
@@ -468,7 +478,7 @@ gpu.module @test_distribution {
gpu.func @vector_transpose(%src: memref<256x32xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32>
-> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
- %load = xegpu.load_nd %tdesc[0, 0]
+ %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x32xf32>
//CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32>
@@ -624,9 +634,8 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
// CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
- // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} :
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
- %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
+ %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<32xf32> to vector<32x32xf32>
%4 = vector.broadcast %3 {layout_result_0 =
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index a8015cced7eb4..50081ed34fe78 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -89,17 +89,17 @@ gpu.module @test_1_1_assignment {
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -109,20 +109,20 @@ gpu.module @test_1_1_assignment {
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
- %load_a = xegpu.load_nd %tdesc_a
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
+ %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16>
-> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
- %load_b = xegpu.load_nd %tdesc_b
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
+ %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1],
order = [1, 0]>>
-> vector<128x128xf16>
- // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%dpas = xegpu.dpas %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -145,7 +145,9 @@ gpu.module @test_1_1_assignment {
gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) {
// CHECK-NOT: vector<32x32xf32>
%dpas = xegpu.dpas %a, %b
- {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], inst_data=[8, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], inst_data=[8, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32>
gpu.return
}
@@ -194,26 +196,28 @@ gpu.module @test_1_1_assignment {
%0 = arith.muli %block_id_x, %c128 : index
%1 = arith.muli %block_id_y, %c128 : index
%2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+ %3 = xegpu.load_nd %2 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
%4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
// CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
// CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) ->
// CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
- // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
- // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
+ // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}> : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
+ // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}> : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
// CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
// CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16>
// CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16>
// CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
%6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3)
- -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
- !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
- %8 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %9 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
- %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
- : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+ -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
+ %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+ %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %10 = xegpu.dpas %8, %9, %arg6
+ {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
+ layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
%11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
%12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
@@ -221,7 +225,7 @@ gpu.module @test_1_1_assignment {
}
%7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32>
-> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- xegpu.store_nd %6#2, %7 : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ xegpu.store_nd %6#2, %7 {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
gpu.return
}
@@ -230,7 +234,7 @@ gpu.module @test_1_1_assignment {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %1 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
%2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
// CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
@@ -244,7 +248,7 @@ gpu.module @test_1_1_assignment {
xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
%4 = arith.addi %arg3, %c1_i32 : i32
%5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %5 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -263,14 +267,14 @@ gpu.module @test_1_1_assignment {
%5 = scf.if %4 -> (vector<256xf32>) {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %2 = xegpu.load_nd %0 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %2 : vector<256xf32>
} else {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %3 = xegpu.load_nd %1 {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %3 : vector<256xf32>
diff --git a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
index 3f2fff9ab51e9..37f6d33e8ac30 100644
--- a/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
+++ b/mlir/test/Integration/Dialect/XeGPU/WG/simple_gemm.mlir
@@ -53,34 +53,34 @@ module @gemm attributes {gpu.container_module} {
%m = arith.muli %block_id_x, %c256 : index
%n = arith.muli %block_id_y, %c256 : index
%c_tdesc = xegpu.create_nd_tdesc %C : memref<256x256xf32> -> !xegpu.tensor_desc<256x256xf32, #c>
- %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] : !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32>
+ %c_init_value = xegpu.load_nd %c_tdesc[%m, %n] {layout = #c}: !xegpu.tensor_desc<256x256xf32, #c> -> vector<256x256xf32>
%a_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a>
%b_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b>
// Prefetch A 3 times.
%a_prefetch_tdesc = xegpu.create_nd_tdesc %A : memref<256x256xf16> -> !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c0] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c32] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %c64] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
// Prefetch B 3 times.
%b_prefetch_tdesc = xegpu.create_nd_tdesc %B : memref<256x256xf16> -> !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c0, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c32, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%c64, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
%out = scf.for %k = %c0 to %c256 step %c32
iter_args(%c_value = %c_init_value)
-> (vector<256x256xf32>) {
- %a_value = xegpu.load_nd %a_tdesc[%m, %k] : !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16>
- %b_value = xegpu.load_nd %b_tdesc[%k, %n] : !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16>
+ %a_value = xegpu.load_nd %a_tdesc[%m, %k] {layout = #a}: !xegpu.tensor_desc<256x32xf16, #a> -> vector<256x32xf16>
+ %b_value = xegpu.load_nd %b_tdesc[%k, %n] {layout = #b}: !xegpu.tensor_desc<32x256xf16, #b> -> vector<32x256xf16>
// Prefetch next tiles.
%prefetch_offset = arith.addi %k, %c96 : index
- xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] : !xegpu.tensor_desc<256x32xf16, #a_prefetch>
- xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] : !xegpu.tensor_desc<32x256xf16, #b_prefetch>
- %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_result_0 = #c}
+ xegpu.prefetch_nd %a_prefetch_tdesc[%m, %prefetch_offset] {layout = #a_prefetch}: !xegpu.tensor_desc<256x32xf16, #a_prefetch>
+ xegpu.prefetch_nd %b_prefetch_tdesc[%prefetch_offset, %n] {layout = #b_prefetch}: !xegpu.tensor_desc<32x256xf16, #b_prefetch>
+ %c_new_value = xegpu.dpas %a_value, %b_value, %c_value {layout_a = #a, layout_b = #b, layout_cd = #c}
: vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32>
scf.yield %c_new_value : vector<256x256xf32>
}
- xegpu.store_nd %out, %c_tdesc[%m, %n] : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c>
+ xegpu.store_nd %out, %c_tdesc[%m, %n] {layout = #c}: vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #c>
gpu.return
}
}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 826d07d1b6eca..1a1520dfa975d 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -183,7 +183,7 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> {
matchAndRewrite(vector::StepOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0));
auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
if (!sliceAttr || sliceAttr.getRank() != 1)
return failure();
@@ -322,7 +322,7 @@ struct TestXeGPULayoutInterface
target.addDynamicallyLegalOp<vector::StepOp>(
[&](vector::StepOp op) -> bool {
- auto layoutName = xegpu::getLayoutName(op->getResult(0));
+ auto layoutName = xegpu::getTemporaryLayoutName(op->getResult(0));
auto sliceAttr = op->getAttrOfType<xegpu::SliceAttr>(layoutName);
return isLegal(sliceAttr);
});
More information about the Mlir-commits
mailing list