[Mlir-commits] [mlir] a14f9f8 - [mlir][xegpu] Add support for accessing the default order of a layout. (#184451)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Mar 4 08:43:04 PST 2026
Author: Charitha Saumya
Date: 2026-03-04T08:42:58-08:00
New Revision: a14f9f822f4813a803eb21797fe90b6367683bcb
URL: https://github.com/llvm/llvm-project/commit/a14f9f822f4813a803eb21797fe90b6367683bcb
DIFF: https://github.com/llvm/llvm-project/commit/a14f9f822f4813a803eb21797fe90b6367683bcb.diff
LOG: [mlir][xegpu] Add support for accessing the default order of a layout. (#184451)
Currently, `getOrder` returns null if the user does not provide an
`order` in an xegpu layout. This behavior is undesirable when coupled with
utility functions that work on top of layouts (like `isTransposeOf`).
This PR introduces `getEffectiveOrderAsInt`, which always returns the true
order, even if the user omits it (the default is row-major, `[rank-1, ..., 0]`).
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 377967dfdb1e5..6f667f4801673 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -217,6 +217,9 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
InterfaceMethod<"Get the effective LaneData of the layout attribute as integer array",
"SmallVector<int64_t>",
"getEffectiveLaneDataAsInt">,
+ InterfaceMethod<"Get the effective order of the layout attribute as integer array",
+ "SmallVector<int64_t>",
+ "getEffectiveOrderAsInt">,
InterfaceMethod<"Derive a new layout by dropping sgLayout and sgData",
"xegpu::DistributeLayoutAttr",
"dropSgLayoutAndData">,
@@ -236,15 +239,15 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
"FailureOr<SmallVector<Value>>",
"delinearizeId",
(ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>,
- InterfaceMethod<[{Derive a new layout with sg_data, inst_data and lane_data set to the
- specified values for the given dimension. Passing -1 for any parameter
+ InterfaceMethod<[{Derive a new layout with sg_data, inst_data and lane_data set to the
+ specified values for the given dimension. Passing -1 for any parameter
preserves its original value.}],
"xegpu::DistributeLayoutAttr",
"setDimData",
(ins "int64_t": $dim,
"int64_t": $sgData,
"int64_t": $instData,
- "int64_t": $laneData)>,
+ "int64_t": $laneData)>,
InterfaceMethod<[{Derive a new layout by collapsing dimensions.
`dimGroup` specifies a group of adjacent dimensions that are collapsed into
a single dimension in the derived layout.}],
@@ -297,13 +300,10 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
// Check laneData
if (!checkTranspose($_self.getEffectiveLaneDataAsInt(), other.getEffectiveLaneDataAsInt(), perm))
return false;
- // Check order if both sides have order field.
- if ($_self.getOrder() && other.getOrder()) {
- auto thisOrderAsInt = llvm::to_vector_of<int64_t>($_self.getOrder().asArrayRef());
- auto otherOrderAsInt = llvm::to_vector_of<int64_t>(other.getOrder().asArrayRef());
- if (!checkTranspose(thisOrderAsInt, otherOrderAsInt, perm))
- return false;
- }
+ // Check order
+ if (!checkTranspose($_self.getEffectiveOrderAsInt(), other.getEffectiveOrderAsInt(), perm))
+ return false;
+
return true;
}]>,
InterfaceMethod</*desc=*/[{Check if this layout is a slice of another layout.}],
@@ -484,7 +484,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
return !isForWorkgroup();
}
- int64_t getRank() {
+ int64_t getRank() const {
if (auto attr = getSgLayout())
return attr.size();
if (auto attr = getInstData())
@@ -540,14 +540,25 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
return {};
}
+ // Returns the order as integers. If order is not set, returns the default
+ // value [rank-1, ..., 0] (row-major, last dim is fastest).
+ SmallVector<int64_t> getEffectiveOrderAsInt() const {
+ if (DenseI32ArrayAttr order = getOrder())
+ return llvm::to_vector_of<int64_t>(order.asArrayRef());
+ int64_t rank = getRank();
+ SmallVector<int64_t> defaultOrder(rank);
+ std::iota(defaultOrder.rbegin(), defaultOrder.rend(), 0);
+ return defaultOrder;
+ }
+
//set the layout for the sepcified unit dims: sg_data, inst_data and lane_data to 1
DistributeLayoutAttr setUnitDimData(SmallVector<int64_t> unitDims) const;
//set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
DistributeLayoutAttr setUnitDimLayout(SmallVector<int64_t> unitDims) const;
- // Derive a new layout with sg_data, inst_data and lane_data set to the
- // specified values for the given dimension. Passing -1 for any parameter
+ // Derive a new layout with sg_data, inst_data and lane_data set to the
+ // specified values for the given dimension. Passing -1 for any parameter
// preserves its original value.
DistributeLayoutAttr setDimData(int64_t dim, int64_t sgData, int64_t instData, int64_t laneData);
@@ -702,6 +713,26 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
return {};
}
+ /// Returns the effective order of the attribute. The parent's effective
+ /// order (or default [rank-1,...,0]) is filtered by removing entries that
+ /// correspond to sliced dimensions, then renumbered to form a valid
+ /// permutation over the remaining dimensions.
+ /// Example: parent order = [2, 0, 1, 3], slice dims = [0, 2] -> result = [1, 0]
+ SmallVector<int64_t> getEffectiveOrderAsInt() const {
+ SliceAttr attr = flatten();
+ auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+ auto order = parent.getEffectiveOrderAsInt();
+ ArrayRef<int64_t> dims = attr.getDims().asArrayRef();
+ SmallVector<int64_t> result;
+ for (int64_t d : order) {
+ if (llvm::is_contained(dims, d))
+ continue;
+ int64_t offset = llvm::count_if(dims, [&](int64_t s) { return s < d; });
+ result.push_back(d - offset);
+ }
+ return result;
+ }
+
SliceAttr dropSgLayoutAndData() const{
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
@@ -726,8 +757,8 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
//set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
DistributeLayoutAttr setUnitDimLayout(SmallVector<int64_t> unitDims) const;
- // Derive a new layout with sg_data, inst_data and lane_data set to the
- // specified values for the given dimension. Passing -1 for any parameter
+ // Derive a new layout with sg_data, inst_data and lane_data set to the
+ // specified values for the given dimension. Passing -1 for any parameter
// preserves its original value.
DistributeLayoutAttr setDimData(int64_t dim, int64_t sgData, int64_t instData, int64_t laneData);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 3c65091c3b10f..30e4a956a0add 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1521,13 +1521,6 @@ struct WgToSgVectorTransposeOp
SmallVector<int64_t> sourceSgLayout =
sourceLayout.getEffectiveSgLayoutAsInt();
SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt();
- DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder();
- DenseI32ArrayAttr resultOrder = layout.getOrder();
-
- if (!sourceOrder || !resultOrder) {
- return rewriter.notifyMatchFailure(
- op, "Both source and result must have order attributes");
- }
ArrayRef<int64_t> permutation = op.getPermutation();
size_t permutationSize = permutation.size();
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index dde58ba31860d..8b980c5083af3 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -567,11 +567,11 @@ gpu.func @create_memdesc(%laneid: index, %arg0 : memref<2048xi8, 3>) {
gpu.func @vector_transpose(%laneid: index) {
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
%cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
: () -> (vector<16x2xf32>)
%transpose = vector.transpose %cst, [1, 0]
{
- layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1]>,
+ layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1], order = [0, 1]>,
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
: vector<16x2xf32> to vector<2x16xf32>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 5cf4ae64a0fd4..b8c92ec8d6f87 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -201,10 +201,10 @@ gpu.module @xevm_module{
%1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
%2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
- -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
- %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+ -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
+ %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+ : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>> -> vector<16x8xi32>
+ %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2], order = [0, 1]>}
: vector<16x8xi32> to vector<16x16xf16>
%5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
: vector<16x16xf16> to vector<16x16xf16>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index ecdfdb9ad34c5..29385406007ba 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -128,7 +128,8 @@ gpu.module @test_distribution {
-> vector<256x128xf32>
// CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32>
// CHECK-NOT: vector.transpose
- %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32>
+ %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<256x128xf32> to vector<128x256xf32>
gpu.return
}
@@ -146,7 +147,7 @@ gpu.module @test_distribution {
%cst16 = arith.constant 16 : index
%constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16]>} : vector<256x128xi1>
gpu.return
- }
+ }
// CHECK-LABEL: distribute_shapecast_expandunitdims_broadcast
// CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index fe9e3683edf7c..2b1655a7ac44f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -97,7 +97,7 @@ gpu.module @test_distribution {
%dpas = xegpu.dpas %load_a, %load_b
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>,
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
+ layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32>
gpu.return
}
@@ -193,7 +193,7 @@ gpu.module @test_distribution {
%8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5)
-> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) {
// load_nd with offset inside loop
- %9 = xegpu.dpas %arg4, %arg5, %arg6
+ %9 = xegpu.dpas %arg4, %arg5, %arg6
{layout_a = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>,
layout_b = #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>,
layout_cd = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
@@ -479,7 +479,8 @@ gpu.module @test_distribution {
: !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>>
-> vector<256x32xf32>
//CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32>
- %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32>
+ %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<256x32xf32> to vector<32x256xf32>
gpu.return
}
@@ -852,7 +853,7 @@ gpu.module @test_distribution {
xegpu.store_nd %6, %2[%0, %1] <{layout = #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>}> : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>>
gpu.return
}
-
+
// CHECK-LABEL: convert_layout_slm
// CHECK-SAME: %[[ARG0:.*]]: memref<128x256xf32>
gpu.func @convert_layout_slm(%arg0: memref<128x256xf32>) {
More information about the Mlir-commits
mailing list