[Mlir-commits] [mlir] 074740d - [MLIR][XeGPU] bug fix: removing temporary slice layout at the pass end (#172589)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Jan 14 09:57:18 PST 2026
Author: Jianhui Li
Date: 2026-01-14T09:57:14-08:00
New Revision: 074740df8a11b66b2aac24eefecd7eafbc6f9a5a
URL: https://github.com/llvm/llvm-project/commit/074740df8a11b66b2aac24eefecd7eafbc6f9a5a
DIFF: https://github.com/llvm/llvm-project/commit/074740df8a11b66b2aac24eefecd7eafbc6f9a5a.diff
LOG: [MLIR][XeGPU] bug fix: removing temporary slice layout at the pass end (#172589)
Remove temporary slice layouts (in addition to the regular layouts) at the
end of the wg-to-sg distribution and blocking passes.
The PR also drops sg_layout/sg_data and inst_data from anchor layouts in
every wg-to-sg/blocking/unrolling pattern.
---------
Signed-off-by: Dmitry Chigarev <dmitry.chigarev at intel.com>
Co-authored-by: Dmitry Chigarev <dmitry.chigarev at intel.com>
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 46d52516cbc15..6573343a8bc97 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -147,6 +147,15 @@ void removeLayoutAttr(const T &operandOrResult);
/// applied recursively to the contained operations
void removeLayoutAttrs(Operation *op);
+/// Updates the NamedAttribute sequence by dropping sg-layout and
+/// sg-data information from any DistributeLayoutAttr found.
+SmallVector<NamedAttribute>
+dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs);
+
+/// Updates the NamedAttribute sequence by dropping inst-data information from
+/// any DistributeLayoutAttr found.
+SmallVector<NamedAttribute> dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs);
+
/// [to-be-deprecated] Sets the DistributeLayoutAttr for a given OpResult
/// user should use setAnchorLayout instead
void setDistributeLayoutAttr(const OpResult &Result,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index f0581208c8a2b..931834ba16d9a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -416,14 +416,14 @@ void XeGPUBlockingPass::runOnOperation() {
// Remove the layout attributes cached per operands.
for (OpOperand &opr : op->getOpOperands()) {
std::string name = xegpu::getTemporaryLayoutName(opr);
- if (op->hasAttrOfType<xegpu::LayoutAttr>(name))
+ if (op->hasAttrOfType<xegpu::DistributeLayoutAttr>(name))
op->removeAttr(name);
}
// Update the layout attributes per result.
for (OpResult result : op->getOpResults()) {
std::string name = xegpu::getTemporaryLayoutName(result);
- if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
+ if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<LoopLikeOpInterface>(op))
xegpu::setDistributeLayoutAttr(result, layout.dropInstData());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index af63f09cd5e4a..8f4e2bb0451d8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -253,7 +253,7 @@ struct UnrollPrefetchNdOp : public UnrollPattern<xegpu::PrefetchNdOp> {
if (!hasOffsets) {
for (auto t : convertedTdesc)
xegpu::PrefetchNdOp::create(rewriter, loc, TypeRange(), t,
- op->getAttrs());
+ xegpu::dropInstDataOnAttrs(op->getAttrs()));
} else {
auto createPrefetch = [&](SmallVector<OpFoldResult> offsets) -> Value {
xegpu::PrefetchNdOp::create(rewriter, loc, convertedTdesc[0], offsets,
@@ -303,8 +303,9 @@ struct UnrollLoadNdOp : public UnrollPattern<xegpu::LoadNdOp> {
if (!hasOffsets) {
for (auto t : convertedTdescs) {
- auto newOp = xegpu::LoadNdOp::create(rewriter, loc, newValueTy, t,
- op->getAttrs());
+ auto newOp =
+ xegpu::LoadNdOp::create(rewriter, loc, newValueTy, t,
+ xegpu::dropInstDataOnAttrs(op->getAttrs()));
newOps.push_back(newOp);
}
} else {
@@ -462,8 +463,9 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
if (tmpC)
operands.push_back(tmpC);
- tmpC = xegpu::DpasOp::create(rewriter, loc, vecTy, operands,
- op->getAttrs());
+ tmpC =
+ xegpu::DpasOp::create(rewriter, loc, vecTy, operands,
+ xegpu::dropInstDataOnAttrs(op->getAttrs()));
}
newOps.push_back(tmpC);
}
@@ -825,7 +827,8 @@ struct UnrollPrefetchOp : public UnrollPattern<xegpu::PrefetchOp> {
op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
for (auto t : convertedTdesc)
- xegpu::PrefetchOp::create(rewriter, loc, TypeRange(), t, op->getAttrs());
+ xegpu::PrefetchOp::create(rewriter, loc, TypeRange(), t,
+ xegpu::dropInstDataOnAttrs(op->getAttrs()));
rewriter.eraseOp(op);
return success();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 07572a4950760..db31d93da663c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -276,8 +276,9 @@ struct WgToSgLoadNdOp : public OpConversionPattern<xegpu::LoadNdOp> {
dyn_cast<xegpu::TensorDescType>(src.getType());
ArrayRef<int64_t> srcShape = tdescTy.getShape();
VectorType newResTy = VectorType::get(srcShape, tdescTy.getElementType());
- auto newLoadOp = xegpu::LoadNdOp::create(rewriter, op.getLoc(), newResTy,
- src, op->getAttrs());
+ auto newLoadOp = xegpu::LoadNdOp::create(
+ rewriter, op.getLoc(), newResTy, src,
+ xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
newLoadOps.push_back(newLoadOp);
}
rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -473,8 +474,9 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
return failure();
for (auto src : adaptor.getTensorDesc())
- xegpu::PrefetchNdOp::create(rewriter, op.getLoc(), TypeRange(), src,
- op->getAttrs());
+ xegpu::PrefetchNdOp::create(
+ rewriter, op.getLoc(), TypeRange(), src,
+ xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
rewriter.eraseOp(op);
return success();
}
@@ -563,16 +565,7 @@ struct WgToSgElementwiseOp : public ConversionPattern {
state.addTypes(newResultType);
// Copy all attributes, but update "layout_result_0" to drop
// sgLayout/sgData
- for (auto attr : op->getAttrs()) {
- if (auto layout =
- dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
- if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
- !layout.getEffectiveInstDataAsInt().empty())
- state.addAttribute(attr.getName(), layout.dropSgLayoutAndData());
- } else {
- state.addAttribute(attr.getName(), attr.getValue());
- }
- }
+ state.addAttributes(xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs()));
Operation *newOp = rewriter.create(state);
newResults.push_back(newOp->getResult(0));
}
@@ -1609,7 +1602,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
getOperation()->walk([](Operation *op) {
for (OpResult result : op->getOpResults()) {
std::string name = xegpu::getTemporaryLayoutName(result);
- if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
+ if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
op->removeAttr(name);
if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op)) {
if (auto newLayout = layout.dropSgLayoutAndData())
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index d3906e37ffbf1..51783b41c4c96 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -380,6 +380,42 @@ void xegpu::removeLayoutAttr(const T &operandOrResult) {
owner->removeAttr(name);
}
+SmallVector<NamedAttribute>
+xegpu::dropSgLayoutAndDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> out;
+ out.reserve(attrs.size());
+
+ for (auto attr : attrs) {
+ if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
+ auto newLayout = dist.dropSgLayoutAndData();
+ if (newLayout)
+ out.emplace_back(attr.getName(), newLayout);
+ } else {
+ out.push_back(attr);
+ }
+ }
+
+ return out;
+}
+
+SmallVector<NamedAttribute>
+xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> out;
+ out.reserve(attrs.size());
+
+ for (auto attr : attrs) {
+ if (auto dist = dyn_cast<xegpu::DistributeLayoutAttr>(attr.getValue())) {
+ auto newLayout = dist.dropInstData();
+ if (newLayout)
+ out.emplace_back(attr.getName(), newLayout);
+ } else {
+ out.push_back(attr);
+ }
+ }
+
+ return out;
+}
+
// Explicit instantiation for OpResult
template void
xegpu::removeLayoutAttr<mlir::OpResult>(const mlir::OpResult &result);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 2b35b75b8d4ea..f81865350ce6a 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -458,7 +458,7 @@ gpu.module @test_kernel {
// CHECK-LABEL: test_prefetch_load_store_update_chunk
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
- // CHECK-COUNT-4: xegpu.prefetch {{.*}} <{layout = #xegpu.layout<inst_data = [16, 2]>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
// CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
// CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<16x2xf32>
// CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
@@ -566,8 +566,8 @@ gpu.module @test_kernel {
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
//CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
- //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
- //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
+ //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>> -> vector<16x16xf16>
//CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>}> : vector<16x16xf16>
//CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
//CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 50081ed34fe78..4f29a686d301f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -203,8 +203,8 @@ gpu.module @test_1_1_assignment {
// CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]]
// CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) ->
// CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
- // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>}> : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
- // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>}> : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
+ // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
+ // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
// CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
// CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16>
// CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16>
More information about the Mlir-commits
mailing list