[Mlir-commits] [mlir] [MLIR][XeGPU] Clean up the temporary layout usage in XeGPU test (PR #195739)
Jianhui Li
llvmlistbot at llvm.org
Tue May 5 09:24:59 PDT 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/195739
>From 485f1803518648383b45f05f6565e32e770c5d15 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 23 Apr 2026 22:24:35 +0000
Subject: [PATCH 01/11] remove recursive loop in getDistributeLayoutAttr. clean
the layout before recover, need to fix test as next step
---
.../Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 16 +---------------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 3 ++-
2 files changed, 3 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 7d48315eec6ff..8188cfa08779d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -33,20 +33,6 @@
using namespace mlir;
-void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
- op->walk([&](Operation *nestOp) {
- for (OpOperand &opr : nestOp->getOpOperands()) {
- auto layout = getDistributeLayoutAttr(opr.get());
- setDistributeLayoutAttr(opr, layout);
- }
-
- for (OpResult result : nestOp->getOpResults()) {
- auto layout = getDistributeLayoutAttr(result);
- setDistributeLayoutAttr(result, layout);
- }
- });
-}
-
SmallVector<NamedAttribute>
xegpu::dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
SmallVector<NamedAttribute> out;
@@ -293,7 +279,7 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
}
});
};
-
+ removeTemporaryLayoutAttrs(rootOp);
rootOp->walk([&](func::FuncOp func) {
processFunc(func.getBody(), func.getSymName());
});
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 2d1ce6eea17aa..13288a377e69a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -162,7 +162,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
if (tiedInit)
- return getDistributeLayoutAttr(tiedInit->get());
+ // return getDistributeLayoutAttr(tiedInit->get());
+ return getTemporaryLayout(*tiedInit);
}
}
>From c47269160cef27bab664a0b257aef671084c8b69 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 02:46:01 +0000
Subject: [PATCH 02/11] fix tests
---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 2 +-
.../Transforms/XeGPUWgToSgDistribute.cpp | 9 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 10 ++
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 8 +-
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 52 ++++++-
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 44 +++++-
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 137 ++++++++++++++++++
7 files changed, 248 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 2d5e1a6397278..eda7b18bd978a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -134,7 +134,7 @@ static void propagateResultsToRegularOperands(Operation *op) {
result.setType(typeWithLayout);
}
}
- if (isa<VectorType>(resultType) && resLayout)
+ if (resLayout)
xegpu::setTemporaryLayout(result, resLayout);
for (OpOperand &opr : op->getOpOperands()) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 8aa0758943cd1..af82effb9d379 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1562,7 +1562,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
vector::TransposeOp, vector::BroadcastOp,
- vector::MultiDimReductionOp,
vector::ConstantMaskOp, vector::CreateMaskOp>(
[=](Operation *op) -> bool {
// Check for either a SliceAttr or LayoutAttr on the result.
@@ -1570,7 +1569,13 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
return isLegal(layout);
});
-
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [=](Operation *op) -> bool {
+ // Check the operand, since the result may be a scalar that carries no layout.
+ auto layout =
+ xegpu::getTemporaryLayout(dyn_cast<vector::MultiDimReductionOp>(op)->getOpOperand(0));
+ return isLegal(layout);
+ });
target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
[=](xegpu::LoadGatherOp op) -> bool {
auto layout = op.getLayoutAttr();
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 08b334ddec3fc..8d65ea497ad88 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -32,6 +32,11 @@ gpu.func @load_nd() {
%0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ %anchor = xegpu.convert_layout %1
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<16x16xf16>
gpu.return
}
@@ -116,6 +121,11 @@ gpu.func @dpas() {
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %anchor = xegpu.convert_layout %4
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<8x16xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index c4401515414b6..e02bd9b0370ad 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -253,10 +253,16 @@ gpu.module @xevm_module{
layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
%5 = math.exp %4
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf32>
%6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
+ %anchor = xegpu.convert_layout %5
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<8x16xf32>
+ xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
!xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 53ce8d0e38949..94e8b7504a1d6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -14,9 +14,14 @@ gpu.module @test_elementwise_ops {
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: vector<24x32xf32>
// CHECK: arith.negf {{.*}} : vector<12x8xf32>
- %negf = arith.negf %load_a
+ %negf = arith.negf %exp
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: vector<24x32xf32>
+ %anchor = xegpu.convert_layout %negf
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>
+ }> : vector<24x32xf32>
gpu.return
}
@@ -32,9 +37,14 @@ gpu.module @test_elementwise_ops {
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
// CHECK: arith.negf {{.*}} : vector<12x8xf32>
- %negf = arith.negf %load_a
+ %negf = arith.negf %exp
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
+ %anchor = xegpu.convert_layout %negf
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+ }> : vector<24x32xf32>
gpu.return
}
@@ -55,9 +65,14 @@ gpu.module @test_elementwise_ops {
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
// CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32>
- %powf = math.powf %load_a, %load_b
+ %powf = math.powf %addf, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
+ %anchor = xegpu.convert_layout %powf
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+ }> : vector<24x32xf32>
gpu.return
}
@@ -83,9 +98,14 @@ gpu.module @test_elementwise_ops {
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi1>, vector<24x32xf32>
// CHECK: math.fma {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32>
- %fma = math.fma %load_a, %load_b, %load_a
+ %fma = math.fma %load_a, %load_b, %select
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
+ %anchor = xegpu.convert_layout %fma
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+ }> : vector<24x32xf32>
gpu.return
}
@@ -105,10 +125,15 @@ gpu.module @test_elementwise_ops {
%truncf = arith.truncf %load_a
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32> to vector<24x32xf16>
- // CHECK: arith.bitcast {{.*}} : vector<12x8xi32> to vector<12x8xf32>
- %bitcast = arith.bitcast %load_b
+ // CHECK: arith.bitcast {{.*}} : vector<12x8xf16> to vector<12x8xi16>
+ %bitcast = arith.bitcast %truncf
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : vector<24x32xi32> to vector<24x32xf32>
+ : vector<24x32xf16> to vector<24x32xi16>
+ %anchor = xegpu.convert_layout %bitcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+ }> : vector<24x32xi16>
gpu.return
}
@@ -142,6 +167,12 @@ gpu.module @test_elementwise_ops {
%cmpi = arith.cmpi eq, %load_c, %load_d
{layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi32>
+ %res = arith.select %cmpi, %cmpi, %cmpf : vector<24x32xi1>, vector<24x32xi1>
+ %anchor = xegpu.convert_layout %res
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>
+ }> : vector<24x32xi1>
gpu.return
}
@@ -165,9 +196,14 @@ gpu.module @test_elementwise_ops {
: vector<24x32xf32>
// CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32>
// CHECK-NOT: math.powf
- %powf = math.powf %load_a, %load_b
+ %powf = math.powf %negf, %load_b
{layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: vector<24x32xf32>
+ %anchor = xegpu.convert_layout %powf
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>
+ }> : vector<24x32xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 17a5db6b8401d..fefe2091d458d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -21,6 +21,11 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
+ %anchor = xegpu.convert_layout %load
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<256x128xf32>
gpu.return
}
@@ -91,6 +96,11 @@ gpu.module @test_distribution {
// CHECK-NOT: arith.addf
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
: vector<256x64xf32> to vector<256xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>
+ }> : vector<256xf32>
gpu.return
}
@@ -130,6 +140,11 @@ gpu.module @test_distribution {
// CHECK-NOT: vector.transpose
%trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x128xf32> to vector<128x256xf32>
+ %anchor = xegpu.convert_layout %trans
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<128x256xf32>
gpu.return
}
@@ -138,6 +153,11 @@ gpu.module @test_distribution {
// CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
// CHECK-NOT: vector.create_mask
%constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>
+ }> : vector<256x128xi1>
gpu.return
}
@@ -146,6 +166,11 @@ gpu.module @test_distribution {
// CHECK-NOT: vector.create_mask
%cst16 = arith.constant 16 : index
%constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>
+ }> : vector<256x128xi1>
gpu.return
}
@@ -160,8 +185,8 @@ gpu.module @test_distribution {
%2 = vector.multi_reduction <maximumf>, %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32>
%3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} : vector<256xf32> to vector<256x1xf32>
%4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : vector<256x1xf32>to vector<256x128xf32>
- %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
- xegpu.store_nd %4, %9[%block_id_x, 0] : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
+ %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ xegpu.store_nd %4, %9[%block_id_x, 0] <{layout =#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}>: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
gpu.return
}
@@ -214,6 +239,11 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<true> : vector<8x256xi1>
%val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32>
%reduce = vector.multi_reduction <add>, %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>
+ }> : vector<8xf32>
gpu.return
}
@@ -240,6 +270,11 @@ gpu.module @test_distribution {
// CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex>
%2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>, dims = [0]>} : vector<8xindex>
%bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>} : vector<8xindex> to vector<256x8xindex>
+ %anchor = xegpu.convert_layout %bcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>,
+ target_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>
+ }> : vector<256x8xindex>
gpu.return
}
@@ -265,6 +300,11 @@ gpu.module @test_distribution {
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<128x1xf32> to vector<128x64xf32>
+ %anchor = xegpu.convert_layout %broadcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>
+ }> : vector<128x64xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index f2cc05808ed12..c9aff190d84d7 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -43,6 +43,11 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
+ %anchor = xegpu.convert_layout %load
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<256x128xf32>
gpu.return
}
@@ -133,6 +138,10 @@ gpu.module @test_distribution {
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<256x1xf32> to vector<256x32xf32>
+ %anchor = xegpu.convert_layout %broadcast
+ <{input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}>
+ : vector<256x32xf32>
gpu.return
}
@@ -148,6 +157,11 @@ gpu.module @test_distribution {
%broadcast = vector.broadcast %load
{layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<1x128xf32> to vector<32x128xf32>
+ %anchor = xegpu.convert_layout %broadcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<32x128xf32>
gpu.return
}
@@ -232,6 +246,11 @@ gpu.module @test_distribution {
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ %anchor = xegpu.convert_layout %exp
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>
+ }> : vector<128x64xf32>
}{sg_id_range = #xegpu.range<[2, 18]>}
gpu.return
}
@@ -261,6 +280,11 @@ gpu.module @test_distribution {
: !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
-> vector<128x64xf32>
%exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ %anchor = xegpu.convert_layout %exp
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>
+ }> : vector<128x64xf32>
}
} {sg_id_range = #xegpu.range<[3, 19]>}
gpu.return
@@ -374,6 +398,10 @@ gpu.module @test_distribution {
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
: vector<4x128xf32> to vector<128xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>}>
+ : vector<128xf32>
gpu.return
}
@@ -388,6 +416,10 @@ gpu.module @test_distribution {
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
: vector<256x64xf32> to vector<256xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>}>
+ : vector<256xf32>
gpu.return
}
@@ -400,6 +432,11 @@ gpu.module @test_distribution {
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
%reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
: vector<4x2x6x32xf16> to vector<4x2x6xf16>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>
+ }> : vector<4x2x6xf16>
gpu.return
}
@@ -427,6 +464,11 @@ gpu.module @test_distribution {
-> vector<32x32xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} [0, 1]
: vector<32x32xf32> to f32
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>
+ }> : f32
gpu.return
}
@@ -446,6 +488,11 @@ gpu.module @test_distribution {
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
%step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
+ %anchor = xegpu.convert_layout %step
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>
+ }> : vector<128xindex>
gpu.return
}
@@ -461,6 +508,11 @@ gpu.module @test_distribution {
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
%step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex>
+ %anchor = xegpu.convert_layout %step
+ <{
+ input_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>,
+ target_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>
+ }> : vector<128xindex>
gpu.return
}
@@ -478,6 +530,11 @@ gpu.module @test_distribution {
%muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
//CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex>
%shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex>
+ %anchor = xegpu.convert_layout %shape_cast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>
+ }> : vector<1x1x1x128xindex>
gpu.return
}
@@ -486,6 +543,11 @@ gpu.module @test_distribution {
%muli = arith.muli %arg0, %arg1 : index
// CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex>
%broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : index to vector<4x2x6x32xindex>
+ %anchor = xegpu.convert_layout %broadcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>
+ }> : vector<4x2x6x32xindex>
gpu.return
}
@@ -499,6 +561,11 @@ gpu.module @test_distribution {
//CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
%trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<256x32xf32> to vector<32x256xf32>
+ %anchor = xegpu.convert_layout %trans
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<32x256xf32>
gpu.return
}
@@ -568,6 +635,11 @@ gpu.module @test_distribution {
gpu.func @scalar_broadcast(%arg0: index) {
// CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex>
%broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>} : index to vector<4x1x1xindex>
+ %anchor = xegpu.convert_layout %broadcast
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>
+ }> : vector<4x1x1xindex>
gpu.return
}
@@ -582,6 +654,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
%constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
+ target_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>
+ }> : vector<32xi1>
gpu.return
}
@@ -603,6 +680,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
%constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+ }> : vector<256x128xi1>
gpu.return
}
@@ -618,6 +700,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
%cst8 = arith.constant 8 : index
%constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
+ target_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>
+ }> : vector<32xi1>
gpu.return
}
@@ -640,6 +727,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
%cst16 = arith.constant 16 : index
%constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+ %anchor = xegpu.convert_layout %constant_mask
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+ }> : vector<256x128xi1>
gpu.return
}
@@ -656,6 +748,11 @@ gpu.module @test_distribution {
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32>
%4 = vector.broadcast %3 {layout_result_0 =
#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256xf32> to vector<256x256xf32>
+ %anchor = xegpu.convert_layout %4
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>
+ }> : vector<256x256xf32>
gpu.return
}
@@ -684,6 +781,11 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<true> : vector<1x32x32xi1>
%14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} : memref<?xf32>, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32>
%15 = vector.multi_reduction <add>, %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32>
+ %anchor = xegpu.convert_layout %15
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>
+ }> : vector<1x32xf32>
gpu.return
}
@@ -721,6 +823,11 @@ gpu.module @test_distribution {
-> vector<256x128xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
: vector<256x128xf32> to vector<128xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>
+ }> : vector<128xf32>
gpu.return
}
@@ -749,6 +856,11 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<true> : vector<2x2x128x128xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} : memref<?xf32>, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>
+ }> : vector<2x2xf32>
gpu.return
}
@@ -777,6 +889,11 @@ gpu.module @test_distribution {
%mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<true> : vector<32x32x128x128xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} : memref<?xf32>, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
+ %anchor = xegpu.convert_layout %reduce
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>
+ }> : vector<32x32xf32>
gpu.return
}
@@ -790,6 +907,11 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16],lane_layout = [1, 16], lane_data = [1, 1]>}>
: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<256x128xf32>
+ %anchor = xegpu.convert_layout %load
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<256x128xf32>
gpu.return
}
@@ -858,6 +980,11 @@ gpu.module @test_distribution {
%1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>> -> vector<128x256xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>}> : vector<128x256xf32>
+ %anchor = xegpu.convert_layout %2
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>
+ }> : vector<128x256xf32>
gpu.return
}
@@ -896,6 +1023,11 @@ gpu.module @test_distribution {
%1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} : memref<?xf32>, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>,
target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>}> : vector<8x128x256xf32>
+ %anchor = xegpu.convert_layout %2
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>
+ }> : vector<8x128x256xf32>
gpu.return
}
@@ -945,6 +1077,11 @@ gpu.module @test_distribution {
%bcast2 = vector.broadcast %scast2 {layout_result_0 =
#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, layout_operand_0 =
#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
+ %anchor = xegpu.convert_layout %bcast2
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>
+ }> : vector<256x16x16x256x16x16xf32>
gpu.return
}
>From eb8574e16ce2f32a741aa67550fad4282c7cb433 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 16:46:17 +0000
Subject: [PATCH 03/11] fix tests
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 10 +
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 113 +++++++
.../XeGPU/sg-to-wi-experimental-unit.mlir | 318 ++++++------------
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 14 +-
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 9 +-
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 14 +-
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 40 ++-
7 files changed, 285 insertions(+), 233 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 2dd8d9f610faf..cafd3f392ff72 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -117,6 +117,16 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
ArrayRef<int64_t> srcShape);
+/// Infers the source layout attribute for an insert operation, using the
+/// same logic as inferInsertStridedSliceSourceLayout.
+DistributeLayoutAttr inferInsertSourceLayout(DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape);
+
+DistributeLayoutAttr inferExtractSourceLayout(DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape);
+
/// Infers the layout attribute for mask and offset operand for Chunked load
/// and store, given the anchor layout attribute for the value being load/store.
DistributeLayoutAttr
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index eda7b18bd978a..5cd1a8e9c83ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -286,6 +286,11 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
rootOp->walk([&](gpu::GPUFuncOp func) {
processFunc(func.getBody(), func.getName());
});
+ // Dump the root op for debugging. NOTE(review): not guarded by LLVM_DEBUG.
+
+ llvm::dbgs() << "After recovering temporary layout attributes for function: "
+ << rootOp->getName() << "\n";
+ rootOp->dump();
return true;
}
@@ -470,6 +475,88 @@ xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
return resLayout;
}
+/// Infers the source layout attribute for an insert operation given the
+/// result layout, result shape, and source shape. If the result has more dims
+/// than the source, the extra leading dims are dropped from the result layout.
+xegpu::DistributeLayoutAttr
+xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape) {
+
+ int srcShapeSize = srcShape.size();
+ int resShapeSize = resShape.size();
+ int dimDiff = resShapeSize - srcShapeSize;
+
+ if (dimDiff > 0) {
+ // The leading dimensions being dropped must not be distributed: a
+ // non-empty effective sg_layout or lane_layout must be 1 in each of them.
+ auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
+ auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+ for (int i = 0; i < dimDiff; i++) {
+ assert((resSgLayout.size() == 0 || resSgLayout[i] == 1) &&
+ (resLaneLayout.size() == 0 || resLaneLayout[i] == 1) &&
+ "Leading dimensions being sliced off must not be distributed");
+ }
+ return resLayout.dropDims(llvm::to_vector(llvm::seq<int64_t>(0, dimDiff)));
+ }
+ return resLayout;
+}
+
+/// Infers the source layout attribute for an extract operation given the
+/// result layout attribute, result shape, and source shape. Prepends unit
+/// dimensions to the result layout so its rank matches the source rank.
+xegpu::DistributeLayoutAttr
+xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
+ ArrayRef<int64_t> resShape,
+ ArrayRef<int64_t> srcShape) {
+
+ int srcShapeSize = srcShape.size();
+ int resShapeSize = resShape.size();
+ int dimDiff = srcShapeSize - resShapeSize;
+ auto context = resLayout.getContext();
+ // Construct the source layout by prepending unit dimensions to the result
+ // layout; every field starts as all-ones at the source rank.
+
+ SmallVector<int64_t> sgLayout(srcShapeSize, 1);
+ SmallVector<int64_t> sgData(srcShapeSize, 1);
+ SmallVector<int64_t> instData(srcShapeSize, 1);
+ SmallVector<int64_t> laneLayout(srcShapeSize, 1);
+ SmallVector<int64_t> laneData(srcShapeSize, 1);
+
+ if (dimDiff > 0) {
+ auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
+ auto resSgData = resLayout.getEffectiveSgDataAsInt();
+ auto resInstData = resLayout.getEffectiveInstDataAsInt();
+ auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+ auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
+
+ for (int i = 0; i < resShapeSize; i++) {
+ sgLayout[dimDiff + i] = (resSgLayout.size() == 0) ? 1 : resSgLayout[i];
+ sgData[dimDiff + i] = (resSgData.size() == 0) ? 1 : resSgData[i];
+ instData[dimDiff + i] = (resInstData.size() == 0) ? 1 : resInstData[i];
+ laneLayout[dimDiff + i] =
+ (resLaneLayout.size() == 0) ? 1 : resLaneLayout[i];
+ laneData[dimDiff + i] = (resLaneData.size() == 0) ? 1 : resLaneData[i];
+ }
+
+ auto toAttr = [&](ArrayRef<int64_t> v) -> DenseI32ArrayAttr {
+ if (v.empty())
+ return DenseI32ArrayAttr();
+ SmallVector<int32_t> v32(v.begin(), v.end());
+ return DenseI32ArrayAttr::get(context, v32);
+ };
+ auto srcLayout = xegpu::LayoutAttr::get(
+ context, resSgLayout.empty() ? nullptr : toAttr(sgLayout),
+ resSgData.empty() ? nullptr : toAttr(sgData),
+ resInstData.empty() ? nullptr : toAttr(instData),
+ resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
+ resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
+ // TODO: add an expandDims method to the layout attribute interface.
+ return srcLayout;
+ }
+ return resLayout;
+}
+
/// Infers the source layout attribute for a shape cast operation given the
/// result layout attribute, result shape, and source shape.
xegpu::DistributeLayoutAttr
@@ -1573,6 +1660,32 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
return resLayout;
}
+ // For vector::InsertOp, infer source layout from result layout using
+ // shapes.
+ if (auto insert = dyn_cast<vector::InsertOp>(op)) {
+ VectorType resVecTy = dyn_cast<VectorType>(insert.getResult().getType());
+ VectorType valueToStoreTy =
+ dyn_cast<VectorType>(insert.getValueToStore().getType());
+
+ if (idx == 0) {
+ return xegpu::inferInsertSourceLayout(resLayout, resVecTy.getShape(),
+ valueToStoreTy.getShape());
+ }
+ if (idx == 1)
+ return resLayout;
+ }
+
+ // For vector::ExtractOp, infer source layout from result layout using
+ // shapes.
+ if (auto extract = dyn_cast<vector::ExtractOp>(op)) {
+ VectorType srcVecTy = dyn_cast<VectorType>(extract.getSource().getType());
+ VectorType resVecTy = dyn_cast<VectorType>(extract.getResult().getType());
+ if (!srcVecTy || !resVecTy)
+ return nullptr;
+ return xegpu::inferExtractSourceLayout(resLayout, resVecTy.getShape(),
+ srcVecTy.getShape());
+ }
+
// For vector::TransposeOp, infer source layout from result layout using
// permutation.
if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 8d65ea497ad88..057c9b80926a5 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -9,7 +9,7 @@ gpu.module @xevm_module {
gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
gpu.return
}
@@ -19,7 +19,7 @@ gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
gpu.return
}
@@ -29,9 +29,9 @@ gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
gpu.func @load_nd() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%anchor = xegpu.convert_layout %1
<{
input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -46,9 +46,9 @@ gpu.func @load_nd() {
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
gpu.func @load_nd_packed() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
gpu.return
}
@@ -58,9 +58,9 @@ gpu.func @load_nd_packed() {
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf32> to vector<1x8xf32>
gpu.func @load_nd_transpose() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xf32>
+ : !xegpu.tensor_desc<16x8xf32> -> vector<16x8xf32>
gpu.return
}
@@ -70,9 +70,9 @@ gpu.func @load_nd_transpose() {
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<64xf16> to vector<2x32x1xf16>
gpu.func @load_nd_array_length() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x32x16xf16>
+ : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<2x32x16xf16>
gpu.return
}
@@ -84,12 +84,12 @@ gpu.func @load_nd_array_length() {
// CHECK: xegpu.store_nd %[[CAST3]], %{{.*}}[%[[C0]], %[[C0]]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.func @store_nd() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
%2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
xegpu.store_nd %2, %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.return
}
@@ -108,14 +108,13 @@ gpu.func @store_nd() {
// CHECK: gpu.return
gpu.func @dpas() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %5 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<0.0> : vector<8x16xf32>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
+ %5 = arith.constant dense<0.0> : vector<8x16xf32>
%2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+ : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%3 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+ : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%4 = xegpu.dpas %2, %3, %5
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -138,13 +137,11 @@ gpu.func @dpas() {
// CHECK: gpu.return
gpu.func @elementwise() {
%c0 = arith.constant 0 : index
- %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<1.0> : vector<16x16xf32>
- %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = arith.constant dense<1.0> : vector<16x16xf32>
+ %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32>
%2 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+ : !xegpu.tensor_desc<16x16xf32> -> vector<16x16xf32>
%3 = arith.addf %0, %2
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<16x16xf32>
%cl3 = xegpu.convert_layout %3
<{
@@ -158,8 +155,7 @@ gpu.func @elementwise() {
// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
// CHECK: gpu.return
gpu.func @arith_constant() {
- %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<1.0> : vector<16x16xf32>
+ %0 = arith.constant dense<1.0> : vector<16x16xf32>
%cl0 = xegpu.convert_layout %0
<{
input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -174,9 +170,9 @@ gpu.func @arith_constant() {
// CHECK: gpu.return
gpu.func @prefetch_nd() {
%c0 = arith.constant 0 : index
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
xegpu.prefetch_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<16x16xf16>
gpu.return
}
@@ -187,12 +183,8 @@ gpu.func @prefetch_nd() {
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf16> to vector<1x8xf16>
gpu.func @scatter_load_chunksize(%src: memref<256xf16>) {
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<true> : vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
+ %mask = arith.constant dense<true> : vector<16xi1>
%0 = xegpu.load %src[%offset], %mask
<{chunk_size = 8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
@@ -209,12 +201,8 @@ gpu.func @scatter_load_chunksize(%src: memref<256xf16>) {
// CHECK: xegpu.store %[[C2]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.func @scatter_store_chunksize(%src: memref<256xf16>) {
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<true> : vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
+ %mask = arith.constant dense<true> : vector<16xi1>
%0 = xegpu.load %src[%offset], %mask
<{chunk_size = 8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}>
: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
@@ -230,12 +218,8 @@ gpu.func @scatter_store_chunksize(%src: memref<256xf16>) {
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]]
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
gpu.func @scatter_load(%src: memref<256xf16>) {
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<true> : vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
+ %mask = arith.constant dense<true> : vector<16xi1>
%0 = xegpu.load %src[%offset], %mask
<{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
@@ -250,12 +234,8 @@ gpu.func @scatter_load(%src: memref<256xf16>) {
// CHECK: xegpu.store %[[LOAD]], %arg0[%[[OFFSET]]], %[[MASK]]
// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.func @scatter_store(%src: memref<256xf16>) {
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<true> : vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
+ %mask = arith.constant dense<true> : vector<16xi1>
%0 = xegpu.load %src[%offset], %mask
<{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
: memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
@@ -279,12 +259,8 @@ gpu.func @scatter_store(%src: memref<256xf16>) {
// CHECK: xegpu.store %[[CAST2]], %arg0[%[[V3]]], %[[V4]]
// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>) {
- %mask = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- dense<1> : vector<1x1x16xi1>
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- dense<12> : vector<1x1x16xindex>
+ %mask = arith.constant dense<1> : vector<1x1x16xi1>
+ %offset = arith.constant dense<12> : vector<1x1x16xindex>
%0 = xegpu.load %src[%offset], %mask
<{layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}>
: memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
@@ -316,8 +292,13 @@ gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>) {
// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD4]], %[[CST]] : f32
gpu.func @vector_reduction() {
%acc = arith.constant 1.0 : f32
- %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : () -> vector<32xf32>
+ %0 = "some_op"() : () -> vector<32xf32>
%2 = vector.reduction <add>, %0, %acc : vector<32xf32> into f32
+ %anchor = xegpu.convert_layout %2
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>,
+ target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>
+ }> : f32
gpu.return
}
@@ -372,16 +353,9 @@ gpu.func @vector_reduction() {
// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<0.0> : vector<2x16xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
- }
+ %src = arith.constant dense<0.0> : vector<2x16xf32>
+ %acc = arith.constant dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
[1] : vector<2x16xf32> to vector<2xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -442,16 +416,9 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
// CHECK: %[[V19:.*]] = vector.insert %[[V18]], %[[V9]] [1] : f32 into vector<2xf32>
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- dense<0.0> : vector<16x2xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
- dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
- }
+ %src = arith.constant dense<0.0> : vector<16x2xf32>
+ %acc = arith.constant dense<0.0> : vector<2xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
[0] : vector<16x2xf32> to vector<2xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -472,16 +439,9 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
// CHECK: gpu.return
gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<0.0> : vector<4x16xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
- dense<0.0> : vector<16xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
- }
+ %src = arith.constant dense<0.0> : vector<4x16xf32>
+ %acc = arith.constant dense<0.0> : vector<16xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
[0] : vector<4x16xf32> to vector<16xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -502,16 +462,9 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
// CHECK: gpu.return
gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- dense<0.0> : vector<16x12xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<16xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
- }
+ %src = arith.constant dense<0.0> : vector<16x12xf32>
+ %acc = arith.constant dense<0.0> : vector<16xf32>
+ %1 = vector.multi_reduction <add>, %src, %acc
[1] : vector<16x12xf32> to vector<16xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -528,12 +481,8 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// CHECK-NEXT: gpu.return
gpu.func @vector_transpose() {
%cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
: () -> (vector<16x2xf32>)
%transpose = vector.transpose %cst, [1, 0]
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
: vector<16x2xf32> to vector<2x16xf32>
%transpose2 = xegpu.convert_layout %transpose
<{
@@ -549,14 +498,13 @@ gpu.func @vector_transpose() {
// CHECK-NEXT: %[[BC:.*]] = vector.bitcast %[[CAST]] : vector<4x2xi8> to vector<4x1xi16>
// CHECK-NEXT: gpu.return
gpu.func @vector_bitcast() {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
- : () -> (vector<4x32xi8>)
- %bitcast = vector.bitcast %cst
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<4x32xi8> to vector<4x16xi16>
+ %cst = "some_op"() : () -> (vector<4x32xi8>)
+ %bitcast = vector.bitcast %cst : vector<4x32xi8> to vector<4x16xi16>
+ %anchor = xegpu.convert_layout %bitcast
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<4x16xi16>
gpu.return
}
@@ -570,7 +518,6 @@ gpu.func @vector_bitcast() {
// CHECK: gpu.return
gpu.func @create_mask_1d(%m0: index) {
%mask = vector.create_mask %m0
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: vector<16xi1>
%mask_cl = xegpu.convert_layout %mask
<{
@@ -590,7 +537,6 @@ gpu.func @create_mask_1d(%m0: index) {
// CHECK: gpu.return
gpu.func @constant_mask_1d() {
%mask = vector.constant_mask [4]
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: vector<16xi1>
%mask_cl = xegpu.convert_layout %mask
<{
@@ -616,7 +562,6 @@ gpu.func @constant_mask_1d() {
// CHECK: gpu.return
gpu.func @create_mask_2d(%m0: index, %m1: index) {
%mask = vector.create_mask %m0, %m1
- {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
: vector<8x4xi1>
%mask_cl = xegpu.convert_layout %mask
<{
@@ -643,7 +588,6 @@ gpu.func @create_mask_2d(%m0: index, %m1: index) {
// CHECK: gpu.return
gpu.func @constant_mask_2d() {
%mask = vector.constant_mask [2, 3]
- {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
: vector<8x4xi1>
%mask_cl = xegpu.convert_layout %mask
<{
@@ -653,7 +597,6 @@ gpu.func @constant_mask_2d() {
gpu.return
}
-
// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local
// CHECK-DAG: %[[SRC:.*]] = arith.constant dense<0.000000e+00> : vector<1x16x2xf32>
// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<1x2xf32>
@@ -669,16 +612,9 @@ gpu.func @constant_mask_2d() {
// CHECK: vector.insert %[[R1]], %[[I0]] [0, 1] : f32 into vector<1x2xf32>
// CHECK: gpu.return
gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local() {
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- dense<0.0> : vector<1x16x32xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
- dense<0.0> : vector<1x32xf32>
+ %src = arith.constant dense<0.0> : vector<1x16x32xf32>
+ %acc = arith.constant dense<0.0> : vector<1x32xf32>
%1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
- }
[1] : vector<1x16x32xf32> to vector<1x32xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -707,16 +643,9 @@ gpu.func @vector_multi_reduction_3d_leading_unit_dim_lane_local() {
// CHECK: vector.insert %[[WITH_ACC1]], %[[INS0]] [0, 1] : f32 into vector<1x2xf32>
// CHECK: gpu.return
gpu.func @vector_multi_reduction_3d_leading_unit_dim_cross_lane() {
- %src = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>}
- dense<0.0> : vector<1x16x2xf32>
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>, dims = [1]>}
- dense<0.0> : vector<1x2xf32>
+ %src = arith.constant dense<0.0> : vector<1x16x2xf32>
+ %acc = arith.constant dense<0.0> : vector<1x2xf32>
%1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16, 1], lane_data = [1, 1, 1]>, dims = [1]>
- }
[1] : vector<1x16x2xf32> to vector<1x2xf32>
%cl1 = xegpu.convert_layout %1
<{
@@ -729,11 +658,8 @@ gpu.func @vector_multi_reduction_3d_leading_unit_dim_cross_lane() {
// CHECK-LABEL: gpu.func @vector_extract_from_2d
// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[0] : vector<1xf32> from vector<4x1xf32>
gpu.func @vector_extract_from_2d() {
- %src = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> vector<4x16xf32>
+ %src = "some_op"() : () -> vector<4x16xf32>
%0 = vector.extract %src[0]
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: vector<16xf32> from vector<4x16xf32>
%cl0 = xegpu.convert_layout %0
<{
@@ -747,10 +673,8 @@ gpu.func @vector_extract_from_2d() {
// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
gpu.func @vector_extract_from_2d_offset2() {
%src = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<8x16xf32>
%0 = vector.extract %src[2]
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: vector<16xf32> from vector<8x16xf32>
%cl0 = xegpu.convert_layout %0
<{
@@ -764,13 +688,10 @@ gpu.func @vector_extract_from_2d_offset2() {
// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[0] : vector<1xf32> into vector<4x1xf32>
gpu.func @vector_insert_into_2d() {
%val = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
%dst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<4x16xf32>
%0 = vector.insert %val, %dst[0]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<16xf32> into vector<4x16xf32>
%cl0 = xegpu.convert_layout %0
<{
@@ -784,13 +705,10 @@ gpu.func @vector_insert_into_2d() {
// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
gpu.func @vector_insert_into_2d_offset2() {
%val = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
%dst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<8x16xf32>
%0 = vector.insert %val, %dst[2]
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<16xf32> into vector<8x16xf32>
%cl0 = xegpu.convert_layout %0
<{
@@ -804,10 +722,8 @@ gpu.func @vector_insert_into_2d_offset2() {
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<24x16xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1]
}
: vector<24x16xf32> to vector<8x16xf32>
%cl1 = xegpu.convert_layout %1
@@ -822,10 +738,8 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_inner_distributed() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<24x64xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1]
}
: vector<24x64xf32> to vector<8x16xf32>
%cl1 = xegpu.convert_layout %1
@@ -840,10 +754,8 @@ gpu.func @vector_extract_strided_slice_inner_distributed() {
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
gpu.func @vector_extract_strided_slice_outer_distributed() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
: () -> vector<32x16xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1]
}
: vector<32x16xf32> to vector<16x16xf32>
%cl1 = xegpu.convert_layout %1
@@ -858,10 +770,8 @@ gpu.func @vector_extract_strided_slice_outer_distributed() {
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
gpu.func @vector_extract_strided_slice_1d() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<64xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1]
}
: vector<64xf32> to vector<32xf32>
%cl1 = xegpu.convert_layout %1
@@ -876,10 +786,8 @@ gpu.func @vector_extract_strided_slice_1d() {
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_partial_offsets() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<24x16xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1]
}
: vector<24x16xf32> to vector<8x16xf32>
%cl1 = xegpu.convert_layout %1
@@ -894,13 +802,10 @@ gpu.func @vector_extract_strided_slice_partial_offsets() {
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<16x16xf32>
%1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<64x16xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1]
}
: vector<16x16xf32> into vector<64x16xf32>
%cl2 = xegpu.convert_layout %2
@@ -915,13 +820,10 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
gpu.func @vector_insert_strided_slice_inner_distributed() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<16x16xf32>
%1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<64x32xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1]
}
: vector<16x16xf32> into vector<64x32xf32>
%cl2 = xegpu.convert_layout %2
@@ -936,13 +838,10 @@ gpu.func @vector_insert_strided_slice_inner_distributed() {
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
gpu.func @vector_insert_strided_slice_outer_distributed() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
: () -> vector<16x16xf32>
%1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
: () -> vector<48x32xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1]
}
: vector<16x16xf32> into vector<48x32xf32>
%cl2 = xegpu.convert_layout %2
@@ -957,13 +856,10 @@ gpu.func @vector_insert_strided_slice_outer_distributed() {
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
gpu.func @vector_insert_strided_slice_1d() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
%1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<48xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1]
}
: vector<16xf32> into vector<48xf32>
%cl2 = xegpu.convert_layout %2
@@ -978,13 +874,10 @@ gpu.func @vector_insert_strided_slice_1d() {
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
gpu.func @vector_insert_strided_slice_different_ranks() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
%1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<64x16xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1]
}
: vector<16xf32> into vector<64x16xf32>
%cl2 = xegpu.convert_layout %2
@@ -999,10 +892,8 @@ gpu.func @vector_insert_strided_slice_different_ranks() {
// CHECK-NOT: xegpu.convert_layout
gpu.func @convert_layout_removed_when_compatible() {
%0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
%2 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>}
: () -> vector<1xf32>
%1 = xegpu.convert_layout %0
<{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
@@ -1106,9 +997,9 @@ gpu.module @xevm_module {
// CHECK: %[[NEG:.*]] = arith.negf %[[SRC]] : vector<16x1xf16>
// CHECK: gpu.return
gpu.func @elementwise_wrap_around_dim() {
- %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %0 = "some_op"()
: () -> vector<16x1xf16>
- %1 = arith.negf %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ %1 = arith.negf %0
: vector<16x1xf16>
%cl1 = xegpu.convert_layout %1
<{
@@ -1128,7 +1019,7 @@ gpu.module @xevm_module {
// CHECK: %[[REM2:.*]] = arith.remui %[[REM]], %[[C16]]{{.*}} : index
// CHECK: %[[VEC:.*]] = vector.from_elements %[[REM2]] : vector<1xindex>
gpu.func @vector_step_slice() {
- %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
+ %0 = vector.step : vector<16xindex>
%cl0 = xegpu.convert_layout %0
<{
input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>,
@@ -1143,7 +1034,7 @@ gpu.module @xevm_module {
// CHECK-LABEL: gpu.func @vector_step_slice_unit
// CHECK: %[[VEC:.*]] = vector.from_elements %{{.*}} : vector<1xindex>
gpu.func @vector_step_slice_unit() {
- %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
+ %0 = vector.step : vector<1xindex>
%cl0 = xegpu.convert_layout %0
<{
input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>,
@@ -1165,7 +1056,7 @@ gpu.module @xevm_module {
// CHECK: %[[V3:.*]] = arith.addi %[[V2]], %{{.*}} : index
// CHECK: %[[VEC:.*]] = vector.from_elements %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<4xindex>
gpu.func @vector_step_slice_multi_dist() {
- %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1, 2, 1]>, dims = [0, 2]>} : vector<16xindex>
+ %0 = vector.step : vector<16xindex>
%cl0 = xegpu.convert_layout %0
<{
input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1, 2, 1]>, dims = [0, 2]>,
@@ -1181,12 +1072,8 @@ gpu.module @xevm_module {
// CHECK: %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1xf32>
gpu.func @vector_shapecast_rank_increasing() {
%cst = "some_op"()
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
: () -> (vector<16xf32>)
%cast = vector.shape_cast %cst
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
: vector<16xf32> to vector<1x16xf32>
%cast_cl = xegpu.convert_layout %cast
<{
@@ -1203,12 +1090,8 @@ gpu.module @xevm_module {
// CHECK: %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1x1xf32> to vector<1xf32>
gpu.func @vector_shapecast_rank_reducing() {
%cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> (vector<1x16xf32>)
%cast = vector.shape_cast %cst
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
- }
: vector<1x16xf32> to vector<16xf32>
%cast_cl = xegpu.convert_layout %cast
<{
@@ -1225,12 +1108,8 @@ gpu.module @xevm_module {
// CHECK: %[[SC:.*]] = vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1xf32>
gpu.func @vector_shapecast_rank_increasing_without_slicing_layout() {
%cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> (vector<16xf32>)
%cast = vector.shape_cast %cst
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
: vector<16xf32> to vector<1x16xf32>
%cast_cl = xegpu.convert_layout %cast
<{
@@ -1248,9 +1127,13 @@ gpu.module @xevm_module {
// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16xf16> to vector<1xf16>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<1xf16> to vector<16x1xf16>
gpu.func @vector_broadcast_1d_to_2d(%laneid: index) {
- %0 = "some_op"() {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : () -> vector<16xf16>
- %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
- "some_use"(%1) : (vector<16x16xf16>) -> ()
+ %0 = "some_op"() : () -> vector<16xf16>
+ %1 = vector.broadcast %0 : vector<16xf16> to vector<16x16xf16>
+ %anchor = xegpu.convert_layout %1
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }> : vector<16x16xf16>
gpu.return
}
}
@@ -1261,8 +1144,7 @@ gpu.module @xevm_module {
// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf16>
// CHECK: gpu.return
gpu.func @constant_wrap_around_dim() {
- %0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- dense<1.0> : vector<16x1xf16>
+ %0 = arith.constant dense<1.0> : vector<16x1xf16>
%cl0 = xegpu.convert_layout %0
<{
input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -1279,9 +1161,14 @@ gpu.module @xevm_module {
// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x16xf16> to vector<16x1xf16>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<16x1xf16> to vector<1x16x1xf16>
gpu.func @vector_broadcast_2d_to_3d(%laneid: index) {
- %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x16xf16>
- %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} : vector<16x16xf16> to vector<1x16x16xf16>
- "some_use"(%1) : (vector<1x16x16xf16>) -> ()
+ %0 = "some_op"() : () -> vector<16x16xf16>
+ %1 = vector.broadcast %0 : vector<16x16xf16> to vector<1x16x16xf16>
+ %2 = xegpu.convert_layout %1
+ <{
+ input_layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
+ target_layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
+ }> : vector<1x16x16xf16>
+ "some_use"(%2) : (vector<1x16x16xf16>) -> ()
gpu.return
}
}
@@ -1292,8 +1179,8 @@ gpu.module @xevm_module {
// CHECK: %[[SRC:.*]] = "some_op"()
// CHECK-NOT: vector.broadcast
gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
- %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
- %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+ %0 = "some_op"() : () -> vector<16x1xf16>
+ %1 = vector.broadcast %0 : vector<16x1xf16> to vector<16x16xf16>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
"some_use"(%2) : (vector<16x16xf16>) -> ()
gpu.return
@@ -1308,8 +1195,9 @@ gpu.module @xevm_module {
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[SRC]] : f16 to vector<16x1xf16>
gpu.func @vector_broadcast_scalar_to_vector(%laneid: index) {
%0 = "some_op"() : () -> f16
- %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
- "some_use"(%1) : (vector<16x16xf16>) -> ()
+ %1 = vector.broadcast %0 : f16 to vector<16x16xf16>
+ %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
+ "some_use"(%2) : (vector<16x16xf16>) -> ()
gpu.return
}
}
@@ -1349,13 +1237,9 @@ gpu.module @xevm_module {
// CHECK: %[[FINAL:.*]] = arith.addf %[[ADD5]], %[[ACC]] : f32
gpu.func @vector_multi_reduction_1d_to_scalar() {
%src = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<32xf32>
%acc = arith.constant 0.0 : f32
%1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>
- }
[0] : vector<32xf32> to f32
%cl1 = xegpu.convert_layout %1
<{
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index e02bd9b0370ad..952d54a43ae38 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -503,12 +503,16 @@ gpu.module @xevm_module {
%2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
%3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
%4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
- %5 = vector.broadcast %4 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
- %cst_0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
- %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+ %anchor = xegpu.convert_layout %4
+ <{
+ input_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>,
+ target_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>
+ }>
+ : f32
+ %5 = vector.broadcast %anchor : f32 to vector<16xf32>
+ %cst_0 = arith.constant dense<0> : vector<16xindex>
+ %cst_1 = arith.constant dense<true> : vector<16xi1>
xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
gpu.return
}
}
-
-
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index c2aac8fa6cf0b..cd7f8b9f69ff2 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -292,11 +292,11 @@ gpu.module @test_kernel {
%m = arith.muli %block_id_x, %c32 : index
%0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
%1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32>
- %11 = vector.shape_cast %1 {layout_result_0 = #l} : vector<32xf32> to vector<32x1xf32>
+ %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32>
// CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32>
- %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32>
+ %2 = vector.broadcast %11 : vector<32x1xf32> to vector<32x64xf32>
%3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l>
- xegpu.store_nd %2, %3[0, 0] : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
+ xegpu.store_nd %2, %3[0, 0] {layout = #l} : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l>
gpu.return
}
}
@@ -480,7 +480,8 @@ gpu.module @test_kernel {
gpu.func @convert_layout(%B: vector<8x32x2xf16>) -> vector<8x32x2xf16> {
%b = xegpu.convert_layout %B <{input_layout = #lb, target_layout = #b}> : vector<8x32x2xf16>
- %e = math.exp %b {layout_result_0 = #b} : vector<8x32x2xf16>
+ %e = math.exp %b : vector<8x32x2xf16>
+ %anchor = xegpu.convert_layout %e <{input_layout = #b, target_layout = #b}> : vector<8x32x2xf16>
gpu.return %e : vector<8x32x2xf16>
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index fefe2091d458d..2f43f9a840173 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -125,7 +125,12 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[T15:.*]] = arith.addi %[[T13]], %[[T14]] : index
// CHECK-DAG: %[[T16:.*]] = vector.broadcast %[[T15]] : index to vector<2x1xindex>
// CHECK-DAG: %[[T17:.*]] = arith.addi %[[CST]], %[[T16]] : vector<2x1xindex>
- %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ %cst_2 = arith.constant dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ %anchor = xegpu.convert_layout %cst_2
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>
+ }> : vector<32x1xindex>
gpu.return
}
@@ -250,7 +255,12 @@ gpu.module @test_distribution {
// CHECK-LABEL: splat_constant
gpu.func @splat_constant() {
// CHECK-COUNT-2: %[[CST:.*]] = arith.constant dense<0> : vector<4xindex>
- %cst_2 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>} dense<0> : vector<8xindex>
+ %cst_2 = arith.constant dense<0> : vector<8xindex>
+ %anchor = xegpu.convert_layout %cst_2
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4], order = [0, 1]>, dims = [0]>
+ }> : vector<8xindex>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index c9aff190d84d7..f9697d83baf58 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -519,7 +519,12 @@ gpu.module @test_distribution {
// CHECK-LABEL: constant_with_slice_attr
gpu.func @constant_with_slice_attr() {
//CHECK: [[cst:%.+]] = arith.constant dense<10> : vector<1xindex>
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>} dense<10> : vector<4xindex>
+ %cst = arith.constant dense<10> : vector<4xindex>
+ %anchor = xegpu.convert_layout %cst
+ <{
+ input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>,
+ target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [1, 2, 3]>
+ }> : vector<4xindex>
gpu.return
}
@@ -582,6 +587,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex>
// CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex>
%cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ %anchor = xegpu.convert_layout %cst
+ <{
+ input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>,
+ target_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>
+ }> : vector<32x1xindex>
gpu.return
}
@@ -602,7 +612,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index
// CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex>
// CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex>
- %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[
+ %cst_8x8 = arith.constant dense<[
[0, 16, 32, 48, 64, 80, 96, 112],
[8, 24, 40, 56, 72, 88, 104, 120],
[16, 32, 48, 64, 80, 96, 112, 128],
@@ -612,6 +622,11 @@ gpu.module @test_distribution {
[48, 64, 80, 96, 112, 128, 144, 160],
[56, 72, 88, 104, 120, 136, 152, 168]
]> : vector<8x8xindex>
+ %anchor = xegpu.convert_layout %cst_8x8
+ <{
+ input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>,
+ target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>
+ }> : vector<8x8xindex>
gpu.return
}
@@ -625,9 +640,19 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index
// CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex>
// CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex>
- %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex>
+ %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex>
+ %anchor = xegpu.convert_layout %cst
+ <{
+ input_layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>,
+ target_layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>
+ }> : vector<32xindex>
// CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex>
- %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+ %cst_1 = arith.constant dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+ %anchor_1 = xegpu.convert_layout %cst_1
+ <{
+ input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>,
+ target_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>
+ }> : vector<1x16xindex>
gpu.return
}
@@ -1232,7 +1257,12 @@ gpu.module @test_distribution {
// CHECK-LABEL: distribute_constant
gpu.func @distribute_constant() {
// CHECK: arith.constant dense<1.000000e+00> : vector<32x32xf32>
- %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} dense<1.0> : vector<256x128xf32>
+ %cst = arith.constant dense<1.0> : vector<256x128xf32>
+ %anchor = xegpu.convert_layout %cst
+ <{
+ input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
+ target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>
+ }> : vector<256x128xf32>
gpu.return
}
>From 707d1d4ae18cefc50c6876015c9a226affa3b2bf Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 19:43:57 +0000
Subject: [PATCH 04/11] fix tests
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 15 +-
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 7 +
.../XeGPU/propagate-layout-inst-data.mlir | 6 +-
.../XeGPU/propagate-layout-subgroup.mlir | 4 +-
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 4 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 5 -
.../Dialect/XeGPU/sg-to-wi-experimental.mlir | 166 ++++++++----------
7 files changed, 101 insertions(+), 106 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4fe15c625ea49..da48bff7b5048 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -419,11 +419,20 @@ LogicalResult LoadNdOp::verify() {
}
}
+ // Handle array_length. Two result shape conventions are accepted:
+ // * Legacy: leading array_length dimension prepended, e.g. descriptor
+ // 16x16 with array_length=2 -> [2, 16, 16].
+ // * Stacked 2D: array blocks stacked along the non-FCD (first) dimension,
+ // e.g. descriptor 16x16 with array_length=2 -> [32, 16].
auto array_len = tdescTy.getArrayLength();
- if (array_len > 1)
- tdescShape.insert(tdescShape.begin(), array_len);
+ SmallVector<int64_t> stackedShape(tdescShape);
+ SmallVector<int64_t> prependedShape(tdescShape);
+ if (array_len > 1 && !tdescShape.empty()) {
+ stackedShape[0] *= array_len;
+ prependedShape.insert(prependedShape.begin(), array_len);
+ }
- if (tdescShape != valueShape)
+ if (valueShape != stackedShape && valueShape != prependedShape)
return emitOpError() << "Result shape " << makeString(valueShape)
<< " is not consistent with tensor descriptor "
<< tdescTy;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index a5776ebce2e95..1a3bc28cec002 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1403,6 +1403,9 @@ struct ResolveLayoutConflicts {
} // namespace
LogicalResult ResolveLayoutConflicts::run() {
+ // Dump the IR before resolving layout conflicts for debugging purposes.
+ DBGS() << "IR before resolving layout conflicts:\n";
+ parentOp->dump();
// Scan all operations in the parent op and resolve layout conflicts at
// tensor descriptor and vector use points.
auto r = parentOp->walk([&](Operation *op) -> WalkResult {
@@ -1445,6 +1448,10 @@ LogicalResult ResolveLayoutConflicts::run() {
return WalkResult::advance();
});
+ // Dump the IR after resolving layout conflicts for debugging purposes.
+ DBGS() << "IR after resolving layout conflicts:\n";
+ parentOp->dump();
+
return r.wasInterrupted() ? failure() : success();
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 6448db93c3f40..0d73985502e3f 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -248,8 +248,8 @@ func.func @insert_strided_slice_inst_data_with_packing(%arg0: memref<8x64xi8>) {
%cst_small = arith.constant dense<1> : vector<4x64xi8>
%cst_large = arith.constant dense<0> : vector<8x64xi8>
%insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x64xi8> into vector<8x64xi8>
- %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
- xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout<inst_data = [8, 64]>}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8, #xegpu.layout<inst_data = [8, 64]>>
+ %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8>
+ xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout<inst_data = [8, 64]>}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8>
return
}
}
@@ -333,7 +333,7 @@ func.func @vector_shape_cast_expand_and_merge(%arg0: memref<256xf16>, %arg1: mem
%4 = vector.shape_cast %2 : vector<2x4x32xf16> to vector<1x256xf16>
%5 = vector.shape_cast %4 : vector<1x256xf16> to vector<256xf16>
- xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<inst_data = [32] >}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+ xegpu.store %5, %arg1[%0], %cst <{layout = #xegpu.layout<inst_data = [32]>}> : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
return
}
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index d4ad9087149c1..2c28073ee1c01 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -133,9 +133,9 @@ gpu.module @test {
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
%reduce = vector.multi_reduction <add>, %load, %cst [1] : vector<32x64xf32> to vector<32xf32>
- %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+ %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32>
xegpu.store_nd %reduce, %tdesc_dst[0] <{layout = #xegpu.layout<sg_layout = [32], sg_data = [1]>}>
- : vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.layout<sg_layout = [32], sg_data = [1]>>
+ : vector<32xf32>, !xegpu.tensor_desc<32xf32>
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 72d066d516540..8c1f85435c771 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -960,8 +960,8 @@ func.func @insert_strided_slice_with_slice_layout(%arg0: memref<8x16xf32>) {
%cst_small8 = vector.extract_strided_slice %cst_large_new {offsets = [0], sizes = [8], strides = [1]} : vector<16xf32> to vector<8xf32>
%cst_small16x8 = vector.broadcast %cst_small8 : vector<8xf32> to vector<16x8xf32>
%cst_small8x16 = vector.transpose %cst_small16x8, [1, 0] : vector<16x8xf32> to vector<8x16xf32>
- %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %cst_small8x16, %tdesc[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %cst_small8x16, %tdesc[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
return
}
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 057c9b80926a5..d018c32bca694 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -32,11 +32,6 @@ gpu.func @load_nd() {
%0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16>
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %anchor = xegpu.convert_layout %1
- <{
- input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }> : vector<16x16xf16>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index 952d54a43ae38..ec553aa33f49b 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -37,37 +37,33 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
%0 = arith.muli %block_id_x, %c8 : index
%1 = arith.muli %block_id_y, %c16 : index
%2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
%3 = xegpu.load_nd %2[%0, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
%5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xbf16>
%6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<16x16xbf16>
%4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
%7 = xegpu.load_nd %5[%0, %arg3]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
%8 = xegpu.load_nd %6[%arg3, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
%9 = xegpu.dpas %7, %8, %arg4
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %9 : vector<8x16xf32>
- } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ }
xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
gpu.return
}
@@ -105,44 +101,40 @@ gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c1024 = arith.constant 1024 : index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.0> : vector<8x16xbf16>
+ %cst = arith.constant dense<1.0> : vector<8x16xbf16>
%block_id_x = gpu.block_id x
%block_id_y = gpu.block_id y
%0 = arith.muli %block_id_x, %c8 : index
%1 = arith.muli %block_id_y, %c16 : index
%2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
%3 = xegpu.load_nd %2[%0, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
%5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xbf16>
%6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<16x16xbf16>
%4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
%7 = xegpu.load_nd %5[%0, %arg3]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
- %preop = arith.addf %7, %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
+ %preop = arith.addf %7, %cst : vector<8x16xbf16>
%8 = xegpu.load_nd %6[%arg3, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
%9 = xegpu.dpas %preop, %8, %arg4
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %9 : vector<8x16xf32>
- } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ }
xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
gpu.return
}
@@ -181,38 +173,34 @@ gpu.func @gemm_with_postop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x102
%0 = arith.muli %block_id_x, %c8 : index
%1 = arith.muli %block_id_y, %c16 : index
%2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
%3 = xegpu.load_nd %2[%0, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
%5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xbf16>
%6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<16x16xbf16>
%4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
%7 = xegpu.load_nd %5[%0, %arg3]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xbf16> -> vector<8x16xbf16>
%8 = xegpu.load_nd %6[%arg3, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ : !xegpu.tensor_desc<16x16xbf16> -> vector<16x16xbf16>
%9 = xegpu.dpas %7, %8, %arg4
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
scf.yield %9 : vector<8x16xf32>
- } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- %postop = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
+ }
+ %postop = math.exp %4 : vector<8x16xf32>
xegpu.store_nd %postop, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
@@ -237,15 +225,15 @@ gpu.module @xevm_module{
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.0> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
- -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.load_nd %0[%c0, %c0]
{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
- !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+ !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
%3 = xegpu.load_nd %2[%c0, %c0]
{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ : !xegpu.tensor_desc<16x16xf16>
-> vector<16x16xf16>
%4 = xegpu.dpas %1, %3, %cst
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -253,17 +241,16 @@ gpu.module @xevm_module{
layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
%5 = math.exp %4
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf32>
%6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
%anchor = xegpu.convert_layout %5
<{
input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}> : vector<8x16xf32>
xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
@@ -287,8 +274,8 @@ gpu.module @xevm_module{
// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.module @xevm_module{
gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+ %1 = arith.constant dense<1>: vector<16xi1>
+ %offset = arith.constant dense<12> : vector<16xindex>
%loaded = scf.if %pred -> (vector<16x8xf16>) {
%3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
@@ -345,33 +332,30 @@ gpu.module @xevm_module{
gpu.module @xevm_module{
gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.0> : vector<8x16xf32>
+ %cst = arith.constant dense<0.0> : vector<8x16xf32>
%0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
- -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.load_nd %0[%c0, %c0]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+ {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
- -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
+ -> !xegpu.tensor_desc<16x8xi32>
%3 = xegpu.load_nd %2[%c0, %c0]
- {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
- : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>> -> vector<16x8xi32>
- %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2], order = [0, 1]>}
+ {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+ : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32>
+ %4 = vector.bitcast %3
: vector<16x8xi32> to vector<16x16xf16>
- %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+ %5 = vector.transpose %4, [1, 0]
: vector<16x16xf16> to vector<16x16xf16>
%6 = xegpu.dpas %1, %5, %cst
{layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
%7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
@@ -448,15 +432,15 @@ gpu.module @xevm_module{
gpu.module @xevm_module{
gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
%c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+ %cst = arith.constant dense<0.000000e+00> : vector<16xf16>
%tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
%tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
- %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
- xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
+ %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %1 = vector.multi_reduction <add>, %0, %cst [0] : vector<16x16xf16> to vector<16xf16>
+ %2 = vector.broadcast %1 : vector<16xf16> to vector<16x16xf16>
+ xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.return
}
}
@@ -470,10 +454,10 @@ gpu.module @xevm_module{
%9 = gpu.block_id x
%10 = arith.index_cast %9 : index to i16
%11 = arith.bitcast %10 : i16 to f16
- %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+ %2 = vector.broadcast %11 : f16 to vector<16x16xf16>
%tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.return
}
}
@@ -497,12 +481,12 @@ gpu.module @xevm_module {
// CHECK: %[[VEC_RED:.*]] = vector.broadcast %{{.*}} : f32 to vector<1xf32>
// CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
- %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
- %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
- %3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
- %4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+ %cst = arith.constant 1.000000e+00 : f32
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+ %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32> -> vector<4x16xf32>
+ %2 = vector.broadcast %cst : f32 to vector<16xf32>
+ %3 = vector.multi_reduction <add>, %1, %2 [0] : vector<4x16xf32> to vector<16xf32>
+ %4 = vector.reduction <add>, %3 : vector<16xf32> into f32
%anchor = xegpu.convert_layout %4
<{
input_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims=[0]>,
>From 061a338d82f0d150afd75a867e176d10ddde2112 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 1 May 2026 22:53:59 +0000
Subject: [PATCH 05/11] fix tests
---
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 80 ++++++-------
.../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 110 +++++-------------
.../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 107 +++++++++--------
3 files changed, 125 insertions(+), 172 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index cd7f8b9f69ff2..a9ba4306b3014 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -30,7 +30,7 @@ gpu.module @test_kernel {
%c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
scf.yield %c
: vector<16x32xf32>
- } {layout_result_0 = #c}
+ }
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
@@ -64,10 +64,10 @@ gpu.module @test_kernel {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
scf.yield %c
: vector<16x32xf32>
- } {layout_result_0 = #l1}
+ }
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
gpu.return
@@ -105,10 +105,10 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %a_tdesc[%c0, %a_off] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16>
//CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc[%a_off, %c0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16>
- %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32>
scf.yield %c
: vector<8x32xf32>
- } {layout_result_0 = #l1}
+ }
//CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1>
gpu.return
@@ -143,12 +143,12 @@ gpu.module @test_kernel {
//CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
//CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
- %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
+ %e = math.exp %a : vector<16x32xf16>
//CHECK-COUNT-8: xegpu.dpas {{.*}}
- %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
scf.yield %c
: vector<16x32xf32>
- } {layout_result_0 = #c}
+ }
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
gpu.return
@@ -176,7 +176,7 @@ gpu.module @test_kernel {
%b = xegpu.load_nd %b_tdesc[%c0, %k] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16>
- %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16>
+ %c = arith.addf %a, %b : vector<16x32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
xegpu.store_nd %c, %c_tdesc[%c0, %k] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l>
@@ -206,7 +206,7 @@ gpu.module @test_kernel {
%b = xegpu.load_nd %b_tdesc[%k] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16>
//CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16>
- %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16>
+ %c = arith.addf %a, %b : vector<32xf16>
//CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16>
xegpu.store_nd %c, %c_tdesc[%k] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l>
@@ -220,7 +220,7 @@ gpu.module @test_kernel {
#r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [0]>
gpu.module @test_kernel {
gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
- %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32>
+ %acc = arith.constant dense<0.0> : vector<64xf32>
%c64 = arith.constant 64 : index
%block_id_x = gpu.block_id x
%m = arith.muli %block_id_x, %c64 : index
@@ -228,7 +228,7 @@ gpu.module @test_kernel {
%1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-3: vector.multi_reduction <add>, {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32>
- %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32>
+ %2 = vector.multi_reduction <add>, %1, %acc [0]: vector<16x64xf32> to vector<64xf32>
%3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
xegpu.store_nd %2, %3[0] {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r>
gpu.return
@@ -242,7 +242,7 @@ gpu.module @test_kernel {
gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
- %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<32xf32>
+ %acc = arith.constant dense<0.0> : vector<32xf32>
%block_id_x = gpu.block_id x
%block_id_y = gpu.block_id y
@@ -255,7 +255,7 @@ gpu.module @test_kernel {
// CHECK: vector.multi_reduction <add>, {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32>
// CHECK-COUNT-1: vector.multi_reduction <add>, {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32>
- %2 = vector.multi_reduction <add>, %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32>
+ %2 = vector.multi_reduction <add>, %1, %acc [1]: vector<32x128xf32> to vector<32xf32>
%3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r>
xegpu.store_nd %2, %3[0] {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r>
gpu.return
@@ -274,7 +274,7 @@ gpu.module @test_kernel {
%0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r>
%1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32>
// CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32>
- %2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32>
+ %2 = vector.broadcast %1 : vector<64xf32> to vector<16x64xf32>
%3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l>
xegpu.store_nd %2, %3[0, 0] {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l>
gpu.return
@@ -313,7 +313,7 @@ gpu.module @test_kernel {
%0 = xegpu.create_nd_tdesc %a : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l>
%1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32>
// CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32>
- %2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32>
+ %2 = vector.transpose %1, [1, 0] : vector<32x8xf32> to vector<8x32xf32>
%3 = xegpu.create_nd_tdesc %b : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t>
xegpu.store_nd %2, %3[0, 0] {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t>
gpu.return
@@ -325,8 +325,8 @@ gpu.module @test_kernel {
gpu.module @test_kernel {
gpu.func @test_vector_constant_mask(%src: ui64, %dst: ui64) {
//CHECK: arith.constant dense<true> : vector<16xi1>
- %mask = vector.constant_mask [32] {layout_result_0 = #l} : vector<32xi1>
- %cst = arith.constant {layout_result_0 = #l} dense<[
+ %mask = vector.constant_mask [32] : vector<32xi1>
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -344,8 +344,8 @@ gpu.module @test_kernel {
gpu.func @test_vector_create_mask(%src: ui64, %dst: ui64) {
%c16 = arith.constant 16 : index
//CHECK-COUNT-2: vector.create_mask {{.*}} : vector<16xi1>
- %mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1>
- %cst = arith.constant {layout_result_0 = #l} dense<[
+ %mask = vector.create_mask %c16 : vector<32xi1>
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -365,8 +365,8 @@ gpu.module @test_kernel {
//CHECK: [[cst:%.+]] = arith.constant dense<16> : vector<16xindex>
//CHECK: [[step:%.+]] = vector.step : vector<16xindex>
//CHECK: arith.addi [[step]], [[cst]] : vector<16xindex>
- %step = vector.step {layout_result_0 = #l} : vector<32xindex>
- %mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1>
+ %step = vector.step : vector<32xindex>
+ %mask = vector.create_mask %c16 : vector<32xi1>
%ld = xegpu.load %src[%step], %mask {chunk_size = 1, layout = #l, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
xegpu.store %ld, %dst[%step], %mask {chunk_size = 1, layout = #l, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
@@ -403,7 +403,7 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %a_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%b = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16>
%a1 = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16>
- %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
+ %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
%c_tdesc = xegpu.create_nd_tdesc %C : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c>
xegpu.store_nd %c, %c_tdesc[0, 0] {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c>
gpu.return
@@ -421,7 +421,7 @@ gpu.module @test_kernel {
%c0 = arith.constant 0 : index
%a_tdesc = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a>
%a = xegpu.load_nd %a_tdesc[0, 0] {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16>
- %a_reduce = vector.multi_reduction <add>, %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16
+ %a_reduce = vector.multi_reduction <add>, %a, %acc [0, 1] : vector<16x16xf16> to f16
%13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16
memref.store %13, %arg1[%c0] : memref<4xf16>
gpu.return
@@ -514,7 +514,7 @@ gpu.module @test_kernel {
// CHECK-LABEL: load_with_offsets
// CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -522,7 +522,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+ %mask = vector.create_mask %c17 : vector<32xi1>
%ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
gpu.return %ld : vector<32xf32>
@@ -534,7 +534,7 @@ gpu.module @test_kernel {
// CHECK-LABEL: store_with_offsets
// CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
gpu.func @store_with_offsets(%src: ui64) {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -542,9 +542,9 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+ %mask = vector.create_mask %c17 : vector<32xi1>
- %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<1023.0>: vector<32xf32>
+ %st_vec = arith.constant dense<1023.0>: vector<32xf32>
xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
@@ -561,7 +561,7 @@ gpu.module @test_kernel {
// CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
// CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -569,7 +569,7 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+ %mask = vector.create_mask %c17 : vector<32xi1>
%ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
gpu.return %ld : vector<32x4xf32>
}
@@ -585,7 +585,7 @@ gpu.module @test_kernel {
// CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
// CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
gpu.func @store_with_offsets_chunk(%src: ui64) {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16]>} dense<[
+ %cst = arith.constant dense<[
0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
@@ -593,9 +593,9 @@ gpu.module @test_kernel {
]> : vector<32xindex>
%c17 = arith.constant 17: index
- %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout<inst_data = [16]>}: vector<32xi1>
+ %mask = vector.create_mask %c17 : vector<32xi1>
- %st_vec = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [16, 2]>} dense<1023.>: vector<32x4xf32>
+ %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
gpu.return
}
@@ -614,14 +614,14 @@ gpu.module @test_kernel {
// CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
// CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
gpu.func @preserve_unit_dim_of_load_inst_data(%src: ui64) -> vector<1x1x32xf32> {
- %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+ %cst = arith.constant dense<[[
[0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184,
192, 200, 208, 216, 224, 232, 240, 248]
]]> : vector<1x1x32xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+ %mask = arith.constant dense<true> : vector<1x1x32xi1>
%ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
gpu.return %ld : vector<1x1x32xf32>
@@ -656,7 +656,7 @@ gpu.module @test_kernel {
%a = xegpu.load_nd %a_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
%b = xegpu.load_nd %b_tdesc[%c0, %c0] {layout = #l}: !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
- %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
+ %result = arith.addf %a, %b : vector<1x32xf32>
xegpu.store_nd %result, %c_tdesc[%c0, %c0] {layout = #l}: vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
gpu.return
}
@@ -685,14 +685,14 @@ gpu.module @test_kernel {
// CHECK: xegpu.store [[v5]], [[arg2]]{{\[}}[[c2]]], [[c0]]
// CHECK-SAME: vector<1x1x16xf32>, ui64, vector<1x1x16xindex>, vector<1x1x16xi1>
gpu.func @load_add_store_leading_unit_dims(%A: ui64, %B: ui64, %C: ui64) {
- %cst = arith.constant {layout_result_0 = #inst_data} dense<[
+ %cst = arith.constant dense<[
[[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]]
]> : vector<1x1x32xindex>
- %mask = arith.constant {layout_result_0 = #inst_data} dense<true> : vector<1x1x32xi1>
+ %mask = arith.constant dense<true> : vector<1x1x32xi1>
%a = xegpu.load %A[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
%b = xegpu.load %B[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
- %addf = arith.addf %a, %b {layout_result_0 = #inst_data} : vector<1x1x32xf32>
+ %addf = arith.addf %a, %b : vector<1x1x32xf32>
xegpu.store %addf, %C[%cst], %mask {chunk_size = 1, layout = #inst_data, l1_hint = #xegpu.cache_hint<cached>} : vector<1x1x32xf32>, ui64, vector<1x1x32xindex>, vector<1x1x32xi1>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
index 94e8b7504a1d6..3e8d183242a91 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -4,18 +4,14 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: unary_ops_sg_layout_only
gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
// CHECK: math.exp {{.*}} : vector<12x8xf32>
%exp = math.exp %load_a
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: vector<24x32xf32>
// CHECK: arith.negf {{.*}} : vector<12x8xf32>
%negf = arith.negf %exp
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8]>}
: vector<24x32xf32>
%anchor = xegpu.convert_layout %negf
<{
@@ -27,18 +23,14 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: unary_ops
gpu.func @unary_ops(%a: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
// CHECK: math.exp {{.*}} : vector<12x8xf32>
%exp = math.exp %load_a
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
// CHECK: arith.negf {{.*}} : vector<12x8xf32>
%negf = arith.negf %exp
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
%anchor = xegpu.convert_layout %negf
<{
@@ -50,24 +42,16 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: binary_ops
gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
// CHECK: arith.addf {{.*}}, {{.*}} : vector<12x8xf32>
- %addf = arith.addf %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : vector<24x32xf32>
+ %addf = arith.addf %load_a, %load_b : vector<24x32xf32>
// CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32>
- %powf = math.powf %addf, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : vector<24x32xf32>
+ %powf = math.powf %addf, %load_b : vector<24x32xf32>
%anchor = xegpu.convert_layout %powf
<{
input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>,
@@ -78,28 +62,20 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: ternary_ops
gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1>
- -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1> -> !xegpu.tensor_desc<24x32xi1>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xi1>
+ : !xegpu.tensor_desc<24x32xi1> -> vector<24x32xi1>
// CHECK: arith.select {{.*}}, {{.*}}, {{.*}} : vector<12x8xi1>, vector<12x8xf32>
%select = arith.select %load_c, %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi1>, vector<24x32xf32>
// CHECK: math.fma {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32>
%fma = math.fma %load_a, %load_b, %select
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
%anchor = xegpu.convert_layout %fma
<{
@@ -111,23 +87,17 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: type_conversion_ops
gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32>
- -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xi32>
+ : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
// CHECK: arith.truncf {{.*}} : vector<12x8xf32> to vector<12x8xf16>
%truncf = arith.truncf %load_a
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32> to vector<24x32xf16>
// CHECK: arith.bitcast {{.*}} : vector<12x8xf16> to vector<12x8xi16>
%bitcast = arith.bitcast %truncf
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf16> to vector<24x32xi16>
%anchor = xegpu.convert_layout %bitcast
<{
@@ -139,33 +109,23 @@ gpu.module @test_elementwise_ops {
// CHECK-LABEL: comparison_ops
gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32>
- -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32>
- -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
+ %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xi32>
+ : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
%load_d = xegpu.load_nd %tdesc_d[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
- -> vector<24x32xi32>
+ : !xegpu.tensor_desc<24x32xi32> -> vector<24x32xi32>
// CHECK: arith.cmpf ult, {{.*}}, {{.*}} : vector<12x8xf32>
%cmpf = arith.cmpf ult, %load_a, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xf32>
// CHECK: arith.cmpi eq, {{.*}}, {{.*}} : vector<12x8xi32>
%cmpi = arith.cmpi eq, %load_c, %load_d
- {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
: vector<24x32xi32>
%res = arith.select %cmpi, %cmpi, %cmpf : vector<24x32xi1>, vector<24x32xi1>
%anchor = xegpu.convert_layout %res
@@ -179,25 +139,19 @@ gpu.module @test_elementwise_ops {
// 1 to N decomposition of elementwise operations
// CHECK-LABEL: elementwise_ops_rr_assignment
gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
- %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32>
- -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+ %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
+ %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
- -> vector<24x32xf32>
+ : !xegpu.tensor_desc<24x32xf32> -> vector<24x32xf32>
// CHECK-COUNT-12: arith.negf {{.*}} : vector<2x2xf32>
// CHECK-NOT: arith.negf
%negf = arith.negf %load_a
- {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: vector<24x32xf32>
// CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32>
// CHECK-NOT: math.powf
%powf = math.powf %negf, %load_b
- {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
: vector<24x32xf32>
%anchor = xegpu.convert_layout %powf
<{
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 2f43f9a840173..b1a6d81bc1140 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -17,9 +17,9 @@ gpu.module @test_distribution {
// CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
// CHECK-NOT: xegpu.load_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
%anchor = xegpu.convert_layout %load
<{
@@ -34,23 +34,23 @@ gpu.module @test_distribution {
// CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NOT: xegpu.store_nd
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
gpu.return
}
// CHECK-LABEL: prefetch_nd
gpu.func @prefetch_nd(%src: memref<256x128xf32>) {
- // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// CHECK-NOT: xegpu.prefetch_nd
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %tdesc[0, 0]
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
+ xegpu.prefetch_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<256x128xf32>
gpu.return
}
@@ -64,14 +64,14 @@ gpu.module @test_distribution {
// CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32>
// CHECK-NOT: xegpu.dpas
%tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16>
- -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf16>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf16>
-> vector<256x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16>
- -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<128x256xf16>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<128x256xf16, #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ : !xegpu.tensor_desc<128x256xf16>
-> vector<128x256xf16>
%dpas = xegpu.dpas %load_a, %load_b
{layout_a = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -84,17 +84,17 @@ gpu.module @test_distribution {
// CHECK-LABEL: vector_reduce_dim_1
gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
+ %cst = arith.constant dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
- -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+ -> !xegpu.tensor_desc<256x64xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>}
- : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+ : !xegpu.tensor_desc<256x64xf32>
-> vector<256x64xf32>
// CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[C0:.*]] [1] : vector<16x64xf32> to vector<16xf32>
// CHECK-NOT: vector.multi_reduction
// CHECK-COUNT-2: arith.addf {{.*}}, {{.*}} : vector<16xf32>
// CHECK-NOT: arith.addf
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
+ %reduce = vector.multi_reduction <add>, %load, %cst [1]
: vector<256x64xf32> to vector<256xf32>
%anchor = xegpu.convert_layout %reduce
<{
@@ -137,13 +137,13 @@ gpu.module @test_distribution {
// CHECK-LABEL: vector_transpose
gpu.func @vector_transpose(%src: memref<256x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
// CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32>
// CHECK-NOT: vector.transpose
- %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
+ %trans = vector.transpose %load, [1, 0]
: vector<256x128xf32> to vector<128x256xf32>
%anchor = xegpu.convert_layout %trans
<{
@@ -157,7 +157,7 @@ gpu.module @test_distribution {
gpu.func @vector_mask_2D() {
// CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
// CHECK-NOT: vector.create_mask
- %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+ %constant_mask = vector.constant_mask [16, 16] : vector<256x128xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
@@ -170,7 +170,7 @@ gpu.module @test_distribution {
// CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1>
// CHECK-NOT: vector.create_mask
%cst16 = arith.constant 16 : index
- %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
+ %constant_mask = vector.create_mask %cst16, %cst16 : vector<256x128xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>,
@@ -183,13 +183,13 @@ gpu.module @test_distribution {
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32>
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32>
gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) {
- %cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>, dims = [1]>} dense<0xFF800000> : vector<256xf32>
+ %cst_0 = arith.constant dense<0xFF800000> : vector<256xf32>
%block_id_x = gpu.block_id x
- %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>>
- %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>> -> vector<256x128xf32>
- %2 = vector.multi_reduction <maximumf>, %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32>
- %3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1], inst_data = [8, 1]>, dims = [1]>} : vector<256xf32> to vector<256x1xf32>
- %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : vector<256x1xf32>to vector<256x128xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<256x128xf32>
+ %2 = vector.multi_reduction <maximumf>, %1, %cst_0 [1] : vector<256x128xf32> to vector<256xf32>
+ %3 = vector.shape_cast %2 : vector<256xf32> to vector<256x1xf32>
+ %4 = vector.broadcast %3 : vector<256x1xf32>to vector<256x128xf32>
%9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
xegpu.store_nd %4, %9[%block_id_x, 0] <{layout =#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 128], inst_data = [8, 16]>}>: vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
gpu.return
@@ -239,11 +239,11 @@ gpu.module @test_distribution {
// CHECK: %[[FINAL_RED1:.*]] = vector.multi_reduction <add>, %[[SLM_LOAD1]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32>
// CHECK: %[[RES1:.*]] = arith.addf %[[FINAL_RED1]], %[[CST_ACC1]] : vector<4xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<0> : vector<8x256xindex>
- %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} dense<0.000000e+00> : vector<8xf32>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>} dense<true> : vector<8x256xi1>
+ %offset = arith.constant dense<0> : vector<8x256xindex>
+ %acc = arith.constant dense<0.000000e+00> : vector<8xf32>
+ %mask = arith.constant dense<true> : vector<8x256xi1>
%val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32>
- %reduce = vector.multi_reduction <add>, %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32>
+ %reduce = vector.multi_reduction <add>, %val, %acc [1] : vector<8x256xf32> to vector<8xf32>
%anchor = xegpu.convert_layout %reduce
<{
input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 16], sg_data = [4, 16]>, dims = [1]>,
@@ -278,8 +278,8 @@ gpu.module @test_distribution {
// CHECK: %[[ADD4:.*]] = arith.addi %[[STEP]], %[[BCST4]] : vector<4xindex>
// CHECK: %[[RES0:.*]] = vector.broadcast %[[ADD0]] : vector<4xindex> to vector<16x4xindex>
// CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex>
- %2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>, dims = [0]>} : vector<8xindex>
- %bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>} : vector<8xindex> to vector<256x8xindex>
+ %2 = vector.step : vector<8xindex>
+ %bcast = vector.broadcast %2 : vector<8xindex> to vector<256x8xindex>
%anchor = xegpu.convert_layout %bcast
<{
input_layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 4]>,
@@ -301,14 +301,13 @@ gpu.module @test_distribution {
// CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32>
gpu.func @broadcast(%src: memref<128x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<128x1xf32>
- -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<128x1xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<128x1xf32>
-> vector<128x1xf32>
// CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16x1xf32> to vector<16x32xf32>
// CHECK-NOT: vector.broadcast
%broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<128x1xf32> to vector<128x64xf32>
%anchor = xegpu.convert_layout %broadcast
<{
@@ -324,12 +323,12 @@ gpu.module @test_distribution {
%c0 = arith.constant 0 : index
%c256 = arith.constant 256 : index
%c1024 = arith.constant 1024 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
// CHECK-LABEL: scf.for
scf.for %arg2 = %c0 to %c1024 step %c256 {
- %3 = xegpu.load_nd %0[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- xegpu.store_nd %3, %1[%arg2] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ %3 = xegpu.load_nd %0[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+ xegpu.store_nd %3, %1[%arg2] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
}
gpu.return
}
@@ -339,9 +338,9 @@ gpu.module @test_distribution {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%c256 = arith.constant 256 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
- %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+ %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
// CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
%3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
%4 = arith.cmpi slt, %arg3, %c10_i32 : i32
@@ -350,9 +349,9 @@ gpu.module @test_distribution {
} do {
// CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32)
^bb0(%arg2: vector<256xf32>, %arg3: i32):
- xegpu.store_nd %arg2, %2[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ xegpu.store_nd %arg2, %2[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
%4 = arith.addi %arg3, %c1_i32 : i32
- %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -361,23 +360,23 @@ gpu.module @test_distribution {
gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
%c10 = arith.constant 10 : index
%0 = gpu.subgroup_id : index
- %1 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
- %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ %1 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
%3 = arith.cmpi eq, %0, %c10 : index
// CHECK-LABEL: scf.if
// CHECK-SAME: (vector<16xf32>, vector<16xf32>)
%4 = scf.if %3 -> (vector<256xf32>) {
- %5 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
} else {
- %5 = xegpu.load_nd %2[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+ %5 = xegpu.load_nd %2[0] {layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>, vector<16xf32>
scf.yield %5 : vector<256xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>}
- xegpu.store_nd %4, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+ }
+ xegpu.store_nd %4, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32>
gpu.return
}
@@ -407,10 +406,10 @@ gpu.module @test_distribution {
}
gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
- %0 = xegpu.create_nd_tdesc %arg0 : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
// CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
// CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
- %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
+ %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
gpu.return
>From 3b6aa925017c508d8bb02946398eb2b65c9eb760 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 2 May 2026 03:31:20 +0000
Subject: [PATCH 06/11] pass all tests
---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 20 +-
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 325 +++++++++---------
2 files changed, 170 insertions(+), 175 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 5cd1a8e9c83ec..b295c74884447 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -175,9 +175,21 @@ static void propagateRegionResultsToYieldOperands(
if (successor.isParent()) {
// For parent successor, get layout from external use points of the
// parent op's results.
- layout = getLayoutFromUsePoints(regionBranchOp->getResult(i));
+ auto regionResult = regionBranchOp->getResult(i);
+ layout = getLayoutFromUsePoints(regionResult);
if (layout)
- xegpu::setTemporaryLayout(regionBranchOp->getResult(i), layout);
+ xegpu::setTemporaryLayout(regionResult, layout);
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(regionResult.getType())) {
+ auto tDescLayout = tensorDescTy.getLayoutAttr();
+ if (!tDescLayout) {
+ auto typeWithLayout = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+ layout);
+ regionResult.setType(typeWithLayout);
+ }
+ }
} else {
// For region successor, get layout from the target region's block
// arg use points (e.g., "before/cond" region args for scf.while
@@ -186,7 +198,9 @@ static void propagateRegionResultsToYieldOperands(
}
if (!layout)
continue;
- if (isa<VectorType>(succOps[i].getType()))
+ auto operandType = succOps[i].getType();
+ if (isa<VectorType>(operandType) ||
+ dyn_cast<xegpu::TensorDescType>(operandType))
xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i), layout);
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index f9697d83baf58..0e79a8056418a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -39,9 +39,9 @@ gpu.module @test_distribution {
//CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index
//CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
%anchor = xegpu.convert_layout %load
<{
@@ -56,24 +56,25 @@ gpu.module @test_distribution {
gpu.func @store_nd(%src: memref<256x128xf32>) {
//CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
gpu.return
}
// CHECK-LABEL: prefetch_nd
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @prefetch_nd(%src: memref<256x128xf32>) {
- //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%cst0 = arith.constant 0 : index
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
xegpu.prefetch_nd %tdesc[%cst0, %cst0]
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}>
+ : !xegpu.tensor_desc<256x128xf32>
gpu.return
}
@@ -81,14 +82,14 @@ gpu.module @test_distribution {
gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
// CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<128x128xf16>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<128x128xf16>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ -> !xegpu.tensor_desc<128x128xf16>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+ : !xegpu.tensor_desc<128x128xf16>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
@@ -102,19 +103,15 @@ gpu.module @test_distribution {
gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) {
// CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>, layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>, layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32>
%tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
- order = [1, 0]>>
+ -> !xegpu.tensor_desc<128x128xf16>
%load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
order = [1, 0]>}
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1],
- order = [1, 0]>>
+ : !xegpu.tensor_desc<128x128xf16>
-> vector<128x128xf16>
%tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16>
- -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
- order = [1, 0]>>
+ -> !xegpu.tensor_desc<128x128xf16>
%load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]> }
- : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1],
- order = [1, 0]>>
+ : !xegpu.tensor_desc<128x128xf16>
-> vector<128x128xf16>
%dpas = xegpu.dpas %load_a, %load_b
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1],
@@ -130,13 +127,12 @@ gpu.module @test_distribution {
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32>
gpu.func @broadcast_dim1(%src: memref<256x1xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32>
- -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x1xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 1], lane_layout = [8, 1], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x1xf32>
-> vector<256x1xf32>
// CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>}
: vector<256x1xf32> to vector<256x32xf32>
%anchor = xegpu.convert_layout %broadcast
<{input_layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [32, 32], lane_layout = [8, 1], lane_data = [1, 1]>,
@@ -149,13 +145,12 @@ gpu.module @test_distribution {
// CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32>
gpu.func @broadcast_dim0(%src: memref<1x128xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32>
- -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<1x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 4], sg_data = [1, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<1x128xf32>
-> vector<1x128xf32>
// CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32>
%broadcast = vector.broadcast %load
- {layout_result_0 = #xegpu.layout<sg_layout = [1, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<1x128xf32> to vector<32x128xf32>
%anchor = xegpu.convert_layout %broadcast
<{
@@ -178,15 +173,15 @@ gpu.module @test_distribution {
%block_id_y = gpu.block_id y
%0 = arith.muli %block_id_x, %c128 : index
%1 = arith.muli %block_id_y, %c128 : index
- %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32>
// CHECK: [[DESC_A:%.+]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x128xf16>
// CHECK: [[DESC_B:%.+]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x16xf16>
- %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
- %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+ %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
+ %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
// load_nd with offset
- %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
- %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}: !xegpu.tensor_desc<128x128xf32> -> vector<128x128xf32>
+ %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+ %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
// scf.for loop
// CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
// CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
@@ -203,14 +198,12 @@ gpu.module @test_distribution {
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
- %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+ %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}: !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
- layout_result_1 = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
- layout_result_2 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ }
// store_nd with offset
- xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32>
gpu.return
}
@@ -228,9 +221,9 @@ gpu.module @test_distribution {
scf.if %cond {
// CHECK-NOT: index.sub
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
} {sg_id_range = #xegpu.range<[0, 32]>}
%cond3 = arith.cmpi sge, %sg_id, %c2 : index
@@ -241,11 +234,11 @@ gpu.module @test_distribution {
// CHECK: %[[C2:.*]] = arith.constant 2 : index
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]]
%tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32>
- -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<128x64xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<128x64xf32>
-> vector<128x64xf32>
- %exp = math.exp %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ %exp = math.exp %load : vector<128x64xf32>
%anchor = xegpu.convert_layout %exp
<{
input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
@@ -262,9 +255,9 @@ gpu.module @test_distribution {
%c3 = arith.constant 3 : index
%c32 = arith.constant 32 : index
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [8, 4], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
%cond1 = arith.cmpi sge, %sg_id, %c3 : index
%cond2 = arith.cmpi slt, %sg_id, %c32 : index
@@ -275,11 +268,11 @@ gpu.module @test_distribution {
// CHECK: %[[C3:.*]] = arith.constant 3 : index
// CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]]
%td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32>
- -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ -> !xegpu.tensor_desc<128x64xf32>
%ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<128x64xf32>
-> vector<128x64xf32>
- %exp = math.exp %ld {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>} : vector<128x64xf32>
+ %exp = math.exp %ld : vector<128x64xf32>
%anchor = xegpu.convert_layout %exp
<{
input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>,
@@ -297,8 +290,8 @@ gpu.module @test_distribution {
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
// CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
+ %offset = arith.constant dense<0> : vector<256x16xindex>
+ %mask = arith.constant dense<1> : vector<256x16xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16>
gpu.return
@@ -312,12 +305,10 @@ gpu.module @test_distribution {
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
// CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
- %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1>
+ %val = arith.constant dense<25.5> : vector<256xf16>
+ %offset = arith.constant dense<0> : vector<256xindex>
+ %mask = arith.constant dense<1> : vector<256xi1>
xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
- layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
- layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
l1_hint = #xegpu.cache_hint<cached>}
: vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
gpu.return
@@ -330,8 +321,8 @@ gpu.module @test_distribution {
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
// CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}>
// CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
+ %offset = arith.constant dense<0> : vector<256xindex>
+ %mask = arith.constant dense<1> : vector<256xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
: memref<?xf16>, vector<256xindex>, vector<256xi1> -> vector<256x4xf16>
gpu.return
@@ -381,7 +372,7 @@ gpu.module @test_distribution {
//CHECK: [[c128:%.+]] = arith.constant 128 : index
//CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index
//CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
- %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} dense<1.0> : vector<64x128xf32>
+ %cst = arith.constant dense<1.0> : vector<64x128xf32>
%mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
@@ -389,14 +380,14 @@ gpu.module @test_distribution {
// CHECK-LABEL: @vector_reduce_dim_0
gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
+ %cst = arith.constant dense<1.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32>
- -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
+ -> !xegpu.tensor_desc<4x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>}
- : !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
+ : !xegpu.tensor_desc<4x128xf32>
-> vector<4x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
+ %reduce = vector.multi_reduction <add>, %load, %cst [0]
: vector<4x128xf32> to vector<128xf32>
%anchor = xegpu.convert_layout %reduce
<{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>,
@@ -407,14 +398,14 @@ gpu.module @test_distribution {
// CHECK-LABEL: @vector_reduce_dim_1
gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
+ %cst = arith.constant dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
- -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
+ -> !xegpu.tensor_desc<256x64xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>}
- : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
+ : !xegpu.tensor_desc<256x64xf32>
-> vector<256x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
+ %reduce = vector.multi_reduction <add>, %load, %cst [1]
: vector<256x64xf32> to vector<256xf32>
%anchor = xegpu.convert_layout %reduce
<{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>,
@@ -425,12 +416,12 @@ gpu.module @test_distribution {
// CHECK-LABEL: @vector_reduce_4D
gpu.func @vector_reduce_4D(%src: ui64) {
- %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0> : vector<4x2x6x32xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>
+ %cst_acc = arith.constant dense<0.0> : vector<4x2x6xf16>
+ %offset = arith.constant dense<0> : vector<4x2x6x32xindex>
+ %mask = arith.constant dense<true> : vector<4x2x6x32xi1>
%load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
- %reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
+ %reduce = vector.multi_reduction <add>, %load, %cst_acc [3]
: vector<4x2x6x32xf16> to vector<4x2x6xf16>
%anchor = xegpu.convert_layout %reduce
<{
@@ -456,13 +447,13 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[FINAL:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_FINAL]] [0, 1] : vector<4x4xf32> to f32
// CHECK-DAG: arith.addf %[[FINAL]], %[[CST]] : f32
gpu.func @vector_reduce_scalar_cross_sg(%src: memref<32x32xf32>) {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} 0.0 : f32
+ %cst = arith.constant 0.0 : f32
%tdesc = xegpu.create_nd_tdesc %src : memref<32x32xf32>
- -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>>
+ -> !xegpu.tensor_desc<32x32xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>}
- : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>>
+ : !xegpu.tensor_desc<32x32xf32>
-> vector<32x32xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 4], sg_data = [8, 8]>, dims = [0, 1]>} [0, 1]
+ %reduce = vector.multi_reduction <add>, %load, %cst [0, 1]
: vector<32x32xf32> to f32
%anchor = xegpu.convert_layout %reduce
<{
@@ -487,7 +478,7 @@ gpu.module @test_distribution {
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
- %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex>
+ %step = vector.step : vector<128xindex>
%anchor = xegpu.convert_layout %step
<{
input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>,
@@ -507,7 +498,7 @@ gpu.module @test_distribution {
//CHECK: [[BASE:%.+]] = vector.step : vector<8xindex>
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
- %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex>
+ %step = vector.step : vector<128xindex>
%anchor = xegpu.convert_layout %step
<{
input_layout = #xegpu.layout<sg_layout = [16], sg_data = [8]>,
@@ -530,11 +521,11 @@ gpu.module @test_distribution {
// CHECK-LABEL: vector_shape_cast
gpu.func @vector_shape_cast() {
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} dense<10> : vector<128xindex>
- %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
- %muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex>
+ %cst = arith.constant dense<10> : vector<128xindex>
+ %step = vector.step : vector<128xindex>
+ %muli = arith.muli %cst, %step : vector<128xindex>
//CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex>
- %shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex>
+ %shape_cast = vector.shape_cast %muli : vector<128xindex> to vector<1x1x1x128xindex>
%anchor = xegpu.convert_layout %shape_cast
<{
input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 4], sg_data = [1, 1, 1, 32]>,
@@ -547,7 +538,7 @@ gpu.module @test_distribution {
gpu.func @vector_broadcast(%arg0: index, %arg1: index) {
%muli = arith.muli %arg0, %arg1 : index
// CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex>
- %broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : index to vector<4x2x6x32xindex>
+ %broadcast = vector.broadcast %muli : index to vector<4x2x6x32xindex>
%anchor = xegpu.convert_layout %broadcast
<{
input_layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>,
@@ -559,12 +550,12 @@ gpu.module @test_distribution {
// CHECK-LABEL: vector_transpose
gpu.func @vector_transpose(%src: memref<256x32xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32>
- -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+ -> !xegpu.tensor_desc<256x32xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>}
- : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
+ : !xegpu.tensor_desc<256x32xf32>
-> vector<256x32xf32>
//CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
- %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>}
+ %trans = vector.transpose %load, [1, 0]
: vector<256x32xf32> to vector<32x256xf32>
%anchor = xegpu.convert_layout %trans
<{
@@ -586,7 +577,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[T6:.*]] = arith.addi %[[T4]], %[[T5]] : index
// CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex>
// CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex>
- %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ %cst = arith.constant dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
%anchor = xegpu.convert_layout %cst
<{
input_layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>,
@@ -659,7 +650,7 @@ gpu.module @test_distribution {
// CHECK-LABEL: scalar_broadcast
gpu.func @scalar_broadcast(%arg0: index) {
// CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex>
- %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>} : index to vector<4x1x1xindex>
+ %broadcast = vector.broadcast %arg0 : index to vector<4x1x1xindex>
%anchor = xegpu.convert_layout %broadcast
<{
input_layout = #xegpu.layout<sg_layout = [4, 8, 1], sg_data = [1, 1, 1]>,
@@ -678,7 +669,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index
// CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
- %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+ %constant_mask = vector.constant_mask [8] : vector<32xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
@@ -704,7 +695,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C7:.*]] : index
// CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
- %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+ %constant_mask = vector.constant_mask [16, 16] : vector<256x128xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
@@ -724,7 +715,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1>
%cst8 = arith.constant 8 : index
- %constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout<sg_layout = [2], sg_data = [16]>} : vector<32xi1>
+ %constant_mask = vector.create_mask %cst8 : vector<32xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [2], sg_data = [16]>,
@@ -751,7 +742,7 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index
// CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1>
%cst16 = arith.constant 16 : index
- %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>} : vector<256x128xi1>
+ %constant_mask = vector.create_mask %cst16, %cst16 : vector<256x128xi1>
%anchor = xegpu.convert_layout %constant_mask
<{
input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>,
@@ -763,16 +754,15 @@ gpu.module @test_distribution {
// CHECK-LABEL: distribute_load_slice_attr
gpu.func @distribute_load_slice_attr() {
%2 = memref.alloca() {alignment = 1024} : memref<4096xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1>
+ %offset = arith.constant dense<0> : vector<256xindex>
+ %mask = arith.constant dense<1> : vector<256xi1>
// CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}>
// CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32>
%3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32>
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32>
- %4 = vector.broadcast %3 {layout_result_0 =
- #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256xf32> to vector<256x256xf32>
+ %4 = vector.broadcast %3 : vector<256xf32> to vector<256x256xf32>
%anchor = xegpu.convert_layout %4
<{
input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>,
@@ -801,11 +791,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [1] : vector<1x32x32xf32> to vector<1x32xf32>
// CHECK-DAG: %[[ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x32xf32>
// CHECK-DAG: gpu.return
- %cst_3 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} dense<1.0> : vector<1x32xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<0> : vector<1x32x32xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<true> : vector<1x32x32xi1>
+ %cst_3 = arith.constant dense<1.0> : vector<1x32xf32>
+ %offset = arith.constant dense<0> : vector<1x32x32xindex>
+ %mask = arith.constant dense<true> : vector<1x32x32xi1>
%14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} : memref<?xf32>, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32>
- %15 = vector.multi_reduction <add>, %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32>
+ %15 = vector.multi_reduction <add>, %14, %cst_3 [1] : vector<1x32x32xf32> to vector<1x32xf32>
%anchor = xegpu.convert_layout %15
<{
input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>,
@@ -840,13 +830,13 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
// CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32>
// CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<32xf32>
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} dense<0.0> : vector<128xf32>
+ %cst = arith.constant dense<0.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ -> !xegpu.tensor_desc<256x128xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>}
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
+ %reduce = vector.multi_reduction <add>, %load, %cst [0]
: vector<256x128xf32> to vector<128xf32>
%anchor = xegpu.convert_layout %reduce
<{
@@ -876,11 +866,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<1x1x4x4xf32> to vector<1x1xf32>
// CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x1xf32>
// CHECK-DAG: gpu.return
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<2x2xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<0> : vector<2x2x128x128xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<true> : vector<2x2x128x128xi1>
+ %cst = arith.constant dense<0.0> : vector<2x2xf32>
+ %offset = arith.constant dense<0> : vector<2x2x128x128xindex>
+ %mask = arith.constant dense<true> : vector<2x2x128x128xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} : memref<?xf32>, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
%anchor = xegpu.convert_layout %reduce
<{
input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>,
@@ -909,11 +899,11 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<16x16x4x4xf32> to vector<16x16xf32>
// CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<16x16xf32>
// CHECK-DAG: gpu.return
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<32x32xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<0> : vector<32x32x128x128xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<true> : vector<32x32x128x128xi1>
+ %cst = arith.constant dense<0.0> : vector<32x32xf32>
+ %offset = arith.constant dense<0> : vector<32x32x128x128xindex>
+ %mask = arith.constant dense<true> : vector<32x32x128x128xi1>
%load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} : memref<?xf32>, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32>
- %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
%anchor = xegpu.convert_layout %reduce
<{
input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>,
@@ -924,19 +914,13 @@ gpu.module @test_distribution {
// CHECK-LABEL: load_nd_tdesc_with_anchor_layout
gpu.func @load_nd_tdesc_with_anchor_layout(%src: memref<256x128xf32>) {
- //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>}>
- // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
+ -> !xegpu.tensor_desc<256x128xf32>
+ // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32>
%load = xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16],lane_layout = [1, 16], lane_data = [1, 1]>}>
- : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
- %anchor = xegpu.convert_layout %load
- <{
- input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], inst_data = [32, 16], lane_layout = [1, 16], lane_data = [1, 1]>
- }> : vector<256x128xf32>
gpu.return
}
@@ -963,7 +947,7 @@ gpu.module @test_distribution {
%10 = xegpu.convert_layout %8 <{input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [32, 16]>, target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [16, 16]>}> : vector<32x256xf16>
%11 = xegpu.dpas %9, %10, %arg4 {layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [16, 16]>, layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32>
scf.yield %11 : vector<256x256xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>}
+ }
xegpu.store_nd %6, %2[%0, %1] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>}> : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>>
gpu.return
}
@@ -1001,8 +985,8 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index
// CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout<inst_data = [16, 16]>}>: !xegpu.mem_desc<128x256xf32>, index, index -> vector<16x32xf32>
- %0 = xegpu.create_nd_tdesc %arg0 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>>
- %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>> -> vector<128x256xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32>
+ %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>} : !xegpu.tensor_desc<128x256xf32> -> vector<128x256xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [4, 16], sg_data = [32, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 32], inst_data = [16, 16]>}> : vector<128x256xf32>
%anchor = xegpu.convert_layout %2
@@ -1043,8 +1027,8 @@ gpu.module @test_distribution {
// CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index
// CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index
// CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Z]], %[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout<inst_data = [1, 16, 16]>}>: !xegpu.mem_desc<8x128x256xf32>, index, index, index -> vector<1x16x32xf32>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} dense<0> : vector<8x128x256xindex>
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} dense<true> : vector<8x128x256xi1>
+ %offset = arith.constant dense<0> : vector<8x128x256xindex>
+ %mask = arith.constant dense<true> : vector<8x128x256xi1>
%1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>} : memref<?xf32>, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [8, 4, 16], sg_data = [1, 32, 16], inst_data = [1, 16, 16]>,
target_layout = #xegpu.layout<sg_layout = [8, 8, 8], sg_data = [1, 16, 32], inst_data = [1, 16, 16]>}> : vector<8x128x256xf32>
@@ -1058,13 +1042,13 @@ gpu.module @test_distribution {
// CHECK-LABEL: convert_layout_reduce_to_scalar
gpu.func @convert_layout_reduce_to_scalar(%arg0: memref<32x32xf32>) {
- %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<true> : vector<32x32xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} dense<0> : vector<32x32xindex>
+ %mask = arith.constant dense<true> : vector<32x32xi1>
+ %offset = arith.constant dense<0> : vector<32x32xindex>
%cst_0 = arith.constant 0.000000e+00 : f32
%intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<32x32xf32> -> index
%10 = arith.index_cast %intptr : index to i64
- %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>}> {layout_operand_1 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_operand_2 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>} : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32>
- %12 = vector.multi_reduction <add>, %11, %cst_0 {layout_operand_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>} [0, 1] : vector<32x32xf32> to f32
+ %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>}> : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32>
+ %12 = vector.multi_reduction <add>, %11, %cst_0 [0, 1] : vector<32x32xf32> to f32
// CHECK-NOT: xegpu.convert_layout
%13 = xegpu.convert_layout %12 <{input_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 32]>, dims = [0, 1]>}> : f32
gpu.return
@@ -1080,28 +1064,25 @@ gpu.module @test_distribution {
gpu.func @distribute_nested_slice(%src: memref<256x256xf32>) {
%tdesc = xegpu.create_nd_tdesc %src : memref<256x256xf32>
- -> !xegpu.tensor_desc<256x256xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>>
+ -> !xegpu.tensor_desc<256x256xf32>
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>}
- : !xegpu.tensor_desc<256x256xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>>
+ : !xegpu.tensor_desc<256x256xf32>
-> vector<256x256xf32>
%load2 = xegpu.convert_layout %load <{input_layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32]>, target_layout = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, dims=[1, 3]>}> : vector<256x256xf32>
- %scast = vector.shape_cast %load2 {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>, dims=[1, 3]>} : vector<256x256xf32> to vector<256x1x256x1xf32>
+ %scast = vector.shape_cast %load2 : vector<256x256xf32> to vector<256x1x256x1xf32>
- %bcast = vector.broadcast %scast {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 1, 16, 32, 1, 1]>, dims=[2]>, dims=[4]>} : vector<256x1x256x1xf32> to vector<256x16x256x16xf32>
+ %bcast = vector.broadcast %scast : vector<256x1x256x1xf32> to vector<256x16x256x16xf32>
- %scast1 = vector.shape_cast %bcast {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>, dims=[4]>} : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32>
+ %scast1 = vector.shape_cast %bcast : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32>
- %bcast1 = vector.broadcast %scast1 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 1]>, dims=[2]>} : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32>
+ %bcast1 = vector.broadcast %scast1 : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32>
- %scast2 = vector.shape_cast %bcast1 {layout_result_0 =
- #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>, layout_operand_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>, dims=[2]>} : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32>
+ %scast2 = vector.shape_cast %bcast1 : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32>
- %bcast2 = vector.broadcast %scast2 {layout_result_0 =
- #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>, layout_operand_0 =
- #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 1, 32, 16, 16]>} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
+ %bcast2 = vector.broadcast %scast2 : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32>
%anchor = xegpu.convert_layout %bcast2
<{
input_layout = #xegpu.layout<sg_layout = [8, 1, 1, 8, 1, 1], sg_data = [32, 16, 16, 32, 16, 16]>,
@@ -1114,11 +1095,11 @@ gpu.module @test_distribution {
// CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32>
// CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout<inst_data = [8, 16]>}>
gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) {
- %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128]>} dense<1.0> : vector<256x128xf32>
+ %val = arith.constant dense<1.0> : vector<256x128xf32>
%tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32>
- -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+ -> !xegpu.tensor_desc<256x128xf32>
xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>}>
- : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 128], inst_data = [8, 16]>>
+ : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32>
gpu.return
}
@@ -1130,7 +1111,7 @@ gpu.module @test_distribution {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
-
+
gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
@@ -1142,10 +1123,10 @@ gpu.module @test_distribution {
%block_id_y = gpu.block_id y
%0 = arith.muli %block_id_x, %c128 : index
%1 = arith.muli %block_id_y, %c128 : index
- %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- %3 = xegpu.load_nd %2[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
- %4 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
- %5 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+ %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32>
+ %3 = xegpu.load_nd %2[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>} : !xegpu.tensor_desc<128x128xf32> -> vector<128x128xf32>
+ %4 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
+ %5 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16>
// CHECK: %[[SCF:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
// CHECK-SAME: iter_args(%[[ARG6:.*]] = {{.*}}) ->
@@ -1156,18 +1137,18 @@ gpu.module @test_distribution {
// CHECK: scf.yield %[[C]] : vector<16x16xf32>
%6 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg6 = %3)
-> (vector<128x128xf32>) {
- %8 = xegpu.load_nd %4[0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
- %9 = xegpu.load_nd %5[%arg3, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+ %8 = xegpu.load_nd %4[0, %arg3] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>} : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
+ %9 = xegpu.load_nd %5[%arg3, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>} : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16>
%10 = xegpu.dpas %8, %9, %arg6
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
: vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
scf.yield %10 : vector<128x128xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+ }
%7 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
- xegpu.store_nd %6, %7[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+ -> !xegpu.tensor_desc<128x128xf32>
+ xegpu.store_nd %6, %7[0, 0] {layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]> } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32>
gpu.return
}
@@ -1176,9 +1157,9 @@ gpu.module @test_distribution {
%c10_i32 = arith.constant 10 : i32
%c0_i32 = arith.constant 0 : i32
%c256 = arith.constant 256 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
- %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
+ %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
// CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
%3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
@@ -1188,9 +1169,9 @@ gpu.module @test_distribution {
} do {
// CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32)
^bb0(%arg2: vector<256xf32>, %arg3: i32):
- xegpu.store_nd %arg2, %2[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ xegpu.store_nd %arg2, %2[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
%4 = arith.addi %arg3, %c1_i32 : i32
- %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
scf.yield %6, %4 : vector<256xf32>, i32
}
gpu.return
@@ -1200,8 +1181,8 @@ gpu.module @test_distribution {
%c10 = arith.constant 10 : index
%id = gpu.subgroup_id : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
%4 = arith.cmpi eq, %id, %c10 : index
// CHECK-LABEL: scf.if
@@ -1209,19 +1190,19 @@ gpu.module @test_distribution {
%5 = scf.if %4 -> (vector<256xf32>) {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %2 = xegpu.load_nd %0[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %2 : vector<256xf32>
} else {
// CHECK-LABEL: xegpu.load_nd
// CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %3 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %3 = xegpu.load_nd %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: vector<16xf32>
scf.yield %3 : vector<256xf32>
- } {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [16]>}
- xegpu.store_nd %5, %0[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ }
+ xegpu.store_nd %5, %0[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32>
gpu.return
}
@@ -1229,28 +1210,28 @@ gpu.module @test_distribution {
%c10 = arith.constant 10 : index
%id = gpu.subgroup_id : index
- %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
- %d = xegpu.load_nd %t[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+ %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
+ %d = xegpu.load_nd %t[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : !xegpu.tensor_desc<256xf32> -> vector<256xf32>
%0 = arith.cmpi eq, %id, %c10 : index
// CHECK-LABEL: scf.if
// CHECK-SAME: (!xegpu.tensor_desc<16xf32>)
- %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>) {
+ %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32>) {
// CHECK-LABEL: xegpu.create_nd_tdesc
// CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
- %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: !xegpu.tensor_desc<16xf32>
- scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ scf.yield %2 : !xegpu.tensor_desc<256xf32>
} else {
// CHECK-LABEL: xegpu.create_nd_tdesc
// CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
- %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32>
// CHECK-LABEL: scf.yield
// CHECK-SAME: !xegpu.tensor_desc<16xf32>
- scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ scf.yield %3 : !xegpu.tensor_desc<256xf32>
}
- xegpu.store_nd %d, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+ xegpu.store_nd %d, %1[0] {layout = #xegpu.layout<sg_layout = [16], sg_data = [16]>} : vector<256xf32>, !xegpu.tensor_desc<256xf32>
gpu.return
}
>From fc237d44bc8608702ae3733880016d422afd670c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Sat, 2 May 2026 04:07:11 +0000
Subject: [PATCH 07/11] polish
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp | 4 +++-
mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir | 7 +------
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 5 -----
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 7 +------
4 files changed, 5 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b295c74884447..08a9f92448b1c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -134,7 +134,9 @@ static void propagateResultsToRegularOperands(Operation *op) {
result.setType(typeWithLayout);
}
}
- if (resLayout)
+ // Multi-reduction op may reduce to scalar which needs layout.
+ if (isa<VectorType>(resultType) && resLayout ||
+ isa<vector::MultiDimReductionOp>(op))
xegpu::setTemporaryLayout(result, resLayout);
for (OpOperand &opr : op->getOpOperands()) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index ec553aa33f49b..c8a9530641951 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -244,12 +244,7 @@ gpu.module @xevm_module{
: vector<8x16xf32>
%6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
!xegpu.tensor_desc<8x16xf32>
- %anchor = xegpu.convert_layout %5
- <{
- input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }> : vector<8x16xf32>
- xegpu.store_nd %anchor, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
+ xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
!xegpu.tensor_desc<8x16xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index b1a6d81bc1140..b015943a54897 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -21,11 +21,6 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
- %anchor = xegpu.convert_layout %load
- <{
- input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
- }> : vector<256x128xf32>
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 0e79a8056418a..ff4e0db629083 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -43,11 +43,6 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<256x128xf32>
-> vector<256x128xf32>
- %anchor = xegpu.convert_layout %load
- <{
- input_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>,
- target_layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>
- }> : vector<256x128xf32>
gpu.return
}
@@ -1111,7 +1106,7 @@ gpu.module @test_distribution {
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
-
+
gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
>From a1a8d2f93ab5944de4a498cd9c39e3fdd7509ffe Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 4 May 2026 20:20:14 +0000
Subject: [PATCH 08/11] polish
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 2 +
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 77 +++++++++----------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 4 -
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 9 +--
4 files changed, 41 insertions(+), 51 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index cafd3f392ff72..bac8f413acd40 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -123,6 +123,8 @@ DistributeLayoutAttr inferInsertSourceLayout(DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
ArrayRef<int64_t> srcShape);
+/// Infers the source layout attribute for an extract operation. Adds
+/// leading dimensions to the source layout to match the source shape size.
DistributeLayoutAttr inferExtractSourceLayout(DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
ArrayRef<int64_t> srcShape);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 08a9f92448b1c..b35224f032d42 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -69,6 +69,18 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
return out;
}
+// Sets the layout on a TensorDesc value by updating its type to include
+// the given layout, if the type does not already have a layout attached.
+static void setTensorDescLayout(Value val, xegpu::DistributeLayoutAttr layout) {
+ auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(val.getType());
+ if (!tensorDescTy || tensorDescTy.getLayoutAttr())
+ return;
+ auto typeWithLayout = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+ val.setType(typeWithLayout);
+}
+
// the walkRegionBackward() is a recursive function
// the input rootOp is the function operation, which is also a region op.
// it recursively processes the region op in reverse topological order.
@@ -122,26 +134,23 @@ static void propagateResultsToRegularOperands(Operation *op) {
xegpu::DistributeLayoutAttr resLayout = getLayoutFromUsePoints(result);
Type resultType = result.getType();
- // recover layout for tensor Descriptor type, which is a special case since
- // its layout is not stored as an attribute but encoded in the type itself.
- // For vector type, we attach the layout as an attribute to op.
- if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
- auto layout = tensorDescTy.getLayoutAttr();
- if (!layout) {
- auto typeWithLayout = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
- result.setType(typeWithLayout);
- }
- }
- // Multi-reduction op may reduce to scalar which needs layout.
- if (isa<VectorType>(resultType) && resLayout ||
- isa<vector::MultiDimReductionOp>(op))
+ if (!resLayout)
+ return;
+
+ // Recover layout for TensorDesc type results by updating the type to include
+ // the layout.
+ if (isa<xegpu::TensorDescType>(resultType))
+ setTensorDescLayout(result, resLayout);
+
+ // Recover layout for vector type results, or for multi-reduction ops which
+ // may reduce to a scalar that still needs a layout.
+ if (isa<VectorType>(resultType) || isa<vector::MultiDimReductionOp>(op))
xegpu::setTemporaryLayout(result, resLayout);
for (OpOperand &opr : op->getOpOperands()) {
xegpu::DistributeLayoutAttr operandLayout =
xegpu::inferSourceLayoutFromResult(opr, resLayout);
+ // Recover layout for vector operands
if (isa<VectorType>(opr.get().getType()) && operandLayout)
xegpu::setTemporaryLayout(opr, operandLayout);
}
@@ -179,18 +188,11 @@ static void propagateRegionResultsToYieldOperands(
// parent op's results.
auto regionResult = regionBranchOp->getResult(i);
layout = getLayoutFromUsePoints(regionResult);
- if (layout)
+ if (layout) {
+ // Set the layout on the region op's result (e.g. the result of scf.for).
xegpu::setTemporaryLayout(regionResult, layout);
- if (auto tensorDescTy =
- dyn_cast<xegpu::TensorDescType>(regionResult.getType())) {
- auto tDescLayout = tensorDescTy.getLayoutAttr();
- if (!tDescLayout) {
- auto typeWithLayout = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
- layout);
- regionResult.setType(typeWithLayout);
- }
+ if (isa<xegpu::TensorDescType>(regionResult.getType()))
+ setTensorDescLayout(regionResult, layout);
}
} else {
// For region successor, get layout from the target region's block
@@ -203,6 +205,7 @@ static void propagateRegionResultsToYieldOperands(
auto operandType = succOps[i].getType();
if (isa<VectorType>(operandType) ||
dyn_cast<xegpu::TensorDescType>(operandType))
+ // recover layout for yield op operands
xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i), layout);
}
}
@@ -228,17 +231,10 @@ static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
continue;
// Recover layout for tensor_desc block args by updating the type.
- if (auto tensorDescTy =
- dyn_cast<xegpu::TensorDescType>(regionArg.getType())) {
- if (!tensorDescTy.getLayoutAttr()) {
- auto typeWithLayout = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
- layout);
- regionArg.setType(typeWithLayout);
- }
- }
+ if (isa<xegpu::TensorDescType>(regionArg.getType()))
+ setTensorDescLayout(regionArg, layout);
+ // Recover layout for region op operands, like scf.for's init operands.
// Find all predecessor values that flow into this block argument.
SmallVector<Value> predValues;
regionOp.getPredecessorValues(regionSuccessor, inputIdx, predValues);
@@ -302,11 +298,6 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
rootOp->walk([&](gpu::GPUFuncOp func) {
processFunc(func.getBody(), func.getName());
});
- // dump out the root op here for debug purpose
-
- llvm::dbgs() << "After recovering temporary layout attributes for function: "
- << rootOp->getName() << "\n";
- rootOp->dump();
return true;
}
@@ -494,6 +485,7 @@ xegpu::DistributeLayoutAttr xegpu::inferInsertStridedSliceSourceLayout(
/// Infers the source layout attribute for an insert operation
/// given the result layout attribute, result shape, and source shape. Removes
/// leading dimensions from the result layout to match the source shape size.
+// TODO: add propagation support for insert op
xegpu::DistributeLayoutAttr
xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
@@ -521,6 +513,8 @@ xegpu::inferInsertSourceLayout(xegpu::DistributeLayoutAttr resLayout,
/// Infers the source layout attribute for extract operation
/// given the result layout attribute, result shape, and source shape. Adds
/// leading dimensions to the source layout to match the source shape size.
+// TODO: add layout attribute interface: expandDims() and use it here.
+// TODO: add propagation support for extract op
xegpu::DistributeLayoutAttr
xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
@@ -567,7 +561,6 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
resInstData.empty() ? nullptr : toAttr(instData),
resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
- // TODO: add layout attribute interface: expandDims
return srcLayout;
}
return resLayout;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 1a3bc28cec002..6613af2dfc164 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1403,9 +1403,6 @@ struct ResolveLayoutConflicts {
} // namespace
LogicalResult ResolveLayoutConflicts::run() {
- // dump the IR before resolving layout conflicts for debugging purposes.
- DBGS() << "IR before resolving layout conflicts:\n";
- parentOp->dump();
// Scan all operations in the parent op and resolve layout conflicts at
// tensor descriptor and vector use points.
auto r = parentOp->walk([&](Operation *op) -> WalkResult {
@@ -1448,7 +1445,6 @@ LogicalResult ResolveLayoutConflicts::run() {
return WalkResult::advance();
});
- // dump the IR after resolving layout conflicts for debugging purposes.
DBGS() << "IR after resolving layout conflicts:\n";
parentOp->dump();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 13288a377e69a..12eb553b3bddc 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -136,10 +136,6 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (!value)
return nullptr;
- if (auto tdescTy =
- dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
- return tdescTy.getLayoutAttr();
-
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
@@ -162,11 +158,14 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
if (tiedInit)
- // return getDistributeLayoutAttr(tiedInit->get());
return getTemporaryLayout(*tiedInit);
}
}
+ if (auto tdescTy =
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
+ return tdescTy.getLayoutAttr();
+
return nullptr;
}
xegpu::DistributeLayoutAttr
>From a772e2d5c9fe487965dbba1f7b82b218c93c8f5d Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Mon, 4 May 2026 20:52:18 +0000
Subject: [PATCH 09/11] polish the legalization condition of wg distribution
---
.../Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index af82effb9d379..8aa0758943cd1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1562,6 +1562,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp,
vector::TransposeOp, vector::BroadcastOp,
+ vector::MultiDimReductionOp,
vector::ConstantMaskOp, vector::CreateMaskOp>(
[=](Operation *op) -> bool {
// Check for either a SliceAttr or LayoutAttr on the result.
@@ -1569,13 +1570,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
return isLegal(layout);
});
- target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
- [=](Operation *op) -> bool {
- // Check operand since the result maybe scalar not bearing layout..
- auto layout =
- xegpu::getTemporaryLayout(dyn_cast<vector::MultiDimReductionOp>(op)->getOpOperand(0));
- return isLegal(layout);
- });
+
target.addDynamicallyLegalOp<xegpu::LoadGatherOp>(
[=](xegpu::LoadGatherOp op) -> bool {
auto layout = op.getLayoutAttr();
>From d5e6054bf4b02ef491a2be33d38e7410a49e65e8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 5 May 2026 16:17:12 +0000
Subject: [PATCH 10/11] address feedback
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 6 ---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 48 +++++++++----------
2 files changed, 22 insertions(+), 32 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index bac8f413acd40..5c6fb1397864f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -39,12 +39,6 @@ LogicalResult propagateLayouts(OpBuilder &builder, Operation *target,
LogicalResult resolveLayoutConflicts(Operation *target);
-/// [to-be-deprecated] Set the DistributeLayoutAttr for each OpOperand and
-/// OpResult of of the given operation. If the operation contains regions, it is
-/// also applied recursively to the contained operations operation.
-/// TODO: To be replaced by recoverTemporaryLayouts()
-void recoverTemporaryLayoutsDeprecated(Operation *op);
-
/// Attach layout attributes to all vector-type operands of operations within
/// the given operation's nested region. Reports an error if any vector operand
/// lacks a layout attribute.
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index b4928fc8ed0f8..735d6d98e1251 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -526,29 +526,24 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
auto context = resLayout.getContext();
// construct the source layout by adding unit dimensions to the front of
// result layout
-
- SmallVector<int64_t> sgLayout(srcShapeSize, 1);
- SmallVector<int64_t> sgData(srcShapeSize, 1);
- SmallVector<int64_t> instData(srcShapeSize, 1);
- SmallVector<int64_t> laneLayout(srcShapeSize, 1);
- SmallVector<int64_t> laneData(srcShapeSize, 1);
-
if (dimDiff > 0) {
- auto resSgLayout = resLayout.getEffectiveSgLayoutAsInt();
- auto resSgData = resLayout.getEffectiveSgDataAsInt();
- auto resInstData = resLayout.getEffectiveInstDataAsInt();
- auto resLaneLayout = resLayout.getEffectiveLaneLayoutAsInt();
- auto resLaneData = resLayout.getEffectiveLaneDataAsInt();
-
- for (int i = 0; i < resShapeSize; i++) {
- sgLayout[dimDiff + i] = (resSgLayout.size() == 0) ? 1 : resSgLayout[i];
- sgData[dimDiff + i] = (resSgData.size() == 0) ? 1 : resSgData[i];
- instData[dimDiff + i] = (resInstData.size() == 0) ? 1 : resInstData[i];
- laneLayout[dimDiff + i] =
- (resLaneLayout.size() == 0) ? 1 : resLaneLayout[i];
- laneData[dimDiff + i] = (resLaneData.size() == 0) ? 1 : resLaneData[i];
+ auto sgLayout = resLayout.getEffectiveSgLayoutAsInt();
+ auto sgData = resLayout.getEffectiveSgDataAsInt();
+ auto instData = resLayout.getEffectiveInstDataAsInt();
+ auto laneLayout = resLayout.getEffectiveLaneLayoutAsInt();
+ auto laneData = resLayout.getEffectiveLaneDataAsInt();
+ auto order = resLayout.getEffectiveOrderAsInt();
+
+ for (int i = resShapeSize; i < dimDiff; i++) {
+ sgLayout.insert(sgLayout.begin(), 1);
+ sgData.insert(sgData.begin(), 1);
+ instData.insert(instData.begin(), 1);
+ laneLayout.insert(laneLayout.begin(), 1);
+ laneData.insert(laneData.begin(), 1);
+ order.insert(order.begin(), i);
}
+ DenseI32ArrayAttr orderAttr = resLayout ? resLayout.getOrder() : nullptr;
auto toAttr = [&](ArrayRef<int64_t> v) -> DenseI32ArrayAttr {
if (v.empty())
return DenseI32ArrayAttr();
@@ -556,11 +551,12 @@ xegpu::inferExtractSourceLayout(xegpu::DistributeLayoutAttr resLayout,
return DenseI32ArrayAttr::get(context, v32);
};
auto srcLayout = xegpu::LayoutAttr::get(
- context, resSgLayout.empty() ? nullptr : toAttr(sgLayout),
- resSgData.empty() ? nullptr : toAttr(sgData),
- resInstData.empty() ? nullptr : toAttr(instData),
- resLaneLayout.empty() ? nullptr : toAttr(laneLayout),
- resLaneData.empty() ? nullptr : toAttr(laneData), nullptr);
+ context, sgLayout.empty() ? nullptr : toAttr(sgLayout),
+ sgData.empty() ? nullptr : toAttr(sgData),
+ instData.empty() ? nullptr : toAttr(instData),
+ laneLayout.empty() ? nullptr : toAttr(laneLayout),
+ laneData.empty() ? nullptr : toAttr(laneData),
+ (orderAttr && !orderAttr.empty()) ? nullptr : toAttr(order));
return srcLayout;
}
return resLayout;
@@ -1687,7 +1683,7 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
VectorType valueToStoreTy =
dyn_cast<VectorType>(insert.getValueToStore().getType());
- if (idx == 0) {
+ if ((idx == 0) && valueToStoreTy) {
return xegpu::inferInsertSourceLayout(resLayout, resVecTy.getShape(),
valueToStoreTy.getShape());
}
>From e67c4d736d31d20812751fe2e465dbeebb5c0ea8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 5 May 2026 16:24:36 +0000
Subject: [PATCH 11/11] address more feedback
---
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 20 ++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 6613af2dfc164..308e5c98d444b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -404,18 +404,18 @@ class LayoutInfoPropagation
visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) override;
- void visitBranchOperand(OpOperand &operand) override {};
+ void visitBranchOperand(OpOperand &operand) override{};
- void visitCallOperand(OpOperand &operand) override {};
+ void visitCallOperand(OpOperand &operand) override{};
void
visitNonControlFlowArguments(RegionSuccessor &successor,
- ArrayRef<BlockArgument> arguments) override {};
+ ArrayRef<BlockArgument> arguments) override{};
- void visitExternalCall(CallOpInterface call,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results) override {
- };
+ void
+ visitExternalCall(CallOpInterface call,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) override{};
void setToExitState(LayoutInfoLattice *lattice) override {
(void)lattice->meet(LayoutInfo());
@@ -1445,8 +1445,10 @@ LogicalResult ResolveLayoutConflicts::run() {
return WalkResult::advance();
});
- DBGS() << "IR after resolving layout conflicts:\n";
- parentOp->dump();
+ LLVM_DEBUG({
+ DBGS() << "IR after resolving layout conflicts:\n";
+ parentOp->dump();
+ });
return r.wasInterrupted() ? failure() : success();
}
More information about the Mlir-commits
mailing list