[Mlir-commits] [mlir] c72a01f - [MLIR][XeGPU] Consider alignment in dpas sg_layout creation (#181141)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Apr 30 07:31:01 PDT 2026
Author: Artem Kroviakov
Date: 2026-04-30T14:30:55Z
New Revision: c72a01f799f1525a2ced61c28a3c1f6577629d3e
URL: https://github.com/llvm/llvm-project/commit/c72a01f799f1525a2ced61c28a3c1f6577629d3e
DIFF: https://github.com/llvm/llvm-project/commit/c72a01f799f1525a2ced61c28a3c1f6577629d3e.diff
LOG: [MLIR][XeGPU] Consider alignment in dpas sg_layout creation (#181141)
Added:
Modified:
mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index d3925c40f9123..f91e80823c2e9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -1269,12 +1269,23 @@ getupDpasSubgroupLayouts(mlir::MLIRContext *context, VectorType aTy,
llvm::DenseSet<LayoutRepresentation> setCD(layoutsCD.begin(),
layoutsCD.end());
std::optional<LayoutRepresentation> bestPick;
+ auto checkAlignedSgDataAB = [&](LayoutRepresentation sgLayout) {
+ return aTy.getShape().back() / sgLayout.second ==
+ bTy.getShape().front() / sgLayout.first;
+ };
for (auto &sgLayout : layoutsB) {
if (setA.contains(sgLayout) && setCD.contains(sgLayout)) {
+ if (!checkAlignedSgDataAB(sgLayout))
+ continue;
+ // Is in (A and B and CD) and matches consumer -> best pick
if (consumerSgLayout.has_value() && sgLayout == *consumerSgLayout) {
bestPick = sgLayout;
break;
}
+ // Is in (A and B and CD). layoutsB is ordered from most
+ // balanced to least. So the first one we see is the most balanced one,
+ // remember it and later only update if there is one that matches the
+ // consumer.
if (!bestPick)
bestPick = sgLayout;
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index d4ad9087149c1..df09d7c4ab592 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -231,7 +231,7 @@ gpu.module @test {
// -----
gpu.module @test {
// CHECK-LABEL: for_loop_dpas
- gpu.func @for_loop_dpas(%arg0: memref<2048x8192xf16>, %arg1: memref<8192x4096xf16>, %arg2: memref<2048x4096xf32>) kernel attributes {known_block_size = array<i32: 8, 1, 16>} {
+ gpu.func @for_loop_dpas(%arg0: memref<2048x8192xf16>, %arg1: memref<8192x4096xf16>, %arg2: memref<2048x4096xf32>) kernel attributes {known_block_size = array<i32: 4, 1, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<128x128xf32>
%c128 = arith.constant 128 : index
%c8192 = arith.constant 8192 : index
@@ -243,25 +243,25 @@ gpu.module @test {
// CHECK: %2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}) -> (vector<128x128xf32>) {
// CHECK-NEXT: xegpu.create_nd_tdesc %{{.*}} : memref<2048x8192xf16> ->
// CHECK-SAME: !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>,
- // CHECK-SAME: #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 128]>>
+ // CHECK-SAME: #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 128]>>
- // CHECK-NEXT: xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 128]>}>
+ // CHECK-NEXT: xegpu.load_nd %{{.*}} <{layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 128]>}>
// CHECK-NEXT: xegpu.create_nd_tdesc %{{.*}} : memref<8192x4096xf16> ->
// CHECK-SAME: !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>,
- // CHECK-SAME: #xegpu.layout<sg_layout = [2, 4], sg_data = [128, 32]>>
+ // CHECK-SAME: #xegpu.layout<sg_layout = [2, 2], sg_data = [128, 64]>>
- // CHECK-NEXT: xegpu.load_nd %6[%arg3, %block_id_y] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [128, 32]>}>
+ // CHECK-NEXT: xegpu.load_nd %6[%arg3, %block_id_y] <{layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [128, 64]>}>
// CHECK-NEXT: xegpu.dpas %{{.*}} {
- // CHECK-SAME: layout_a = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 128]>,
- // CHECK-SAME: layout_b = #xegpu.layout<sg_layout = [2, 4], sg_data = [128, 32]>,
- // CHECK-SAME: layout_cd = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 32]>}
+ // CHECK-SAME: layout_a = #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 128]>,
+ // CHECK-SAME: layout_b = #xegpu.layout<sg_layout = [2, 2], sg_data = [128, 64]>,
+ // CHECK-SAME: layout_cd = #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 64]>}
// CHECK-SAME: : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
// CHECK-NEXT: scf.yield %{{.*}} : vector<128x128xf32>
- // CHECK-NEXT: } {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 32]>}
- // CHECK: xegpu.store_nd %{{.*}} <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 32]>}>
+ // CHECK-NEXT: } {layout_result_0 = #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 64]>}
+ // CHECK: xegpu.store_nd %{{.*}} <{layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [64, 64]>}>
%2 = scf.for %arg3 = %c0 to %c8192 step %c128 iter_args(%arg4 = %cst) -> (vector<128x128xf32>) {
%4 = xegpu.create_nd_tdesc %arg0 : memref<2048x8192xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>>
@@ -277,6 +277,35 @@ gpu.module @test {
}
}
+// -----
+gpu.module @test {
+ // CHECK-LABEL: for_loop_misaligned_dpas_fail
+ gpu.func @for_loop_misaligned_dpas_fail(%arg0: memref<2048x8192xf16>, %arg1: memref<8192x4096xf16>, %arg2: memref<2048x4096xf32>) kernel attributes {known_block_size = array<i32: 8, 1, 16>} {
+ %cst = arith.constant dense<0.000000e+00> : vector<128x128xf32>
+ %c128 = arith.constant 128 : index
+ %c8192 = arith.constant 8192 : index
+ %c0 = arith.constant 0 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %0 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%block_id_x]
+ %1 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%block_id_y]
+ %2 = scf.for %arg3 = %c0 to %c8192 step %c128 iter_args(%arg4 = %cst) -> (vector<128x128xf32>) {
+ %4 = xegpu.create_nd_tdesc %arg0 : memref<2048x8192xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>>
+ %5 = xegpu.load_nd %4[%block_id_x, %arg3] : !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<128x128xf16>
+ %6 = xegpu.create_nd_tdesc %arg1 : memref<8192x4096xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>>
+ %7 = xegpu.load_nd %6[%arg3, %block_id_y] : !xegpu.tensor_desc<128x128xf16, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<128x128xf16>
+ // Could not find a layout whose sg_data would be aligned on the reduction dimension.
+ // CHECK: xegpu.dpas %{{.*}} {layout_cd = #xegpu.layout<sg_layout = [2, 4], sg_data = [64, 32]>} :
+ %8 = xegpu.dpas %5, %7, %arg4 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+ scf.yield %8 : vector<128x128xf32>
+ }
+ %3 = xegpu.create_nd_tdesc %arg2 : memref<2048x4096xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ xegpu.store_nd %2, %3[%0, %1] : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ gpu.return
+ }
+}
+
+
// -----
gpu.module @test {
// CHECK-LABEL: dpas_fails
More information about the Mlir-commits
mailing list