[Mlir-commits] [mlir] [MLIR][XeGPU] Disable block count usage in layout propagation (PR #168504)
Artem Kroviakov
llvmlistbot at llvm.org
Fri Nov 21 02:38:53 PST 2025
https://github.com/akroviakov updated https://github.com/llvm/llvm-project/pull/168504
>From 3358a59c590cb712cfe46edf1e62ab7dcbcee6b0 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 18 Nov 2025 08:43:56 +0000
Subject: [PATCH 1/3] [MLIR][XeGPU] Disable block count usage in layout
propagation
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index b3a780abd3f12..6b3ba5a5981ce 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -495,8 +495,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
auto [bWidth, bHeight, bCount] = blockWHC.value();
SmallVector<int> instData;
int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
- bCount);
+ static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
if (instWidth == -1)
prefetch.emitWarning(
"No suitable instruction multiple found for the given shape.");
@@ -702,8 +701,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
auto [bWidth, bHeight, bCount] = blockWHC.value();
SmallVector<int> instData;
int instWidth = xegpu::getLargestDivisor(
- static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
- bCount);
+ static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
if (instWidth == -1)
store.emitWarning(
"No suitable instruction multiple found for the given shape.");
>From 161303c8562796c03c5a720477590fab2d42400a Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Thu, 20 Nov 2025 13:25:45 +0000
Subject: [PATCH 2/3] Add inst option description
---
mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 12270af870b3b..1fb01c3586e93 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -37,6 +37,18 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
propagate the layouts required for their operands to the producers. With
this propagated layout information, pass will then update op result type
with the layout information.
+
+ `inst`:
+ Sets the `inst_data` field of the layout attribute.
+ The goal is to select the highest granularity of the
+ instruction shape to minimize the number of instructions required
+ for processing the user shape.
+ For nd operations, the granularity depends on (W)idth, (H)eight and (B)lock count.
+ The maximum allowed block count of one operation may differ from that of
+ another operation, even though both operations
+ may use the same tensor descriptor. Layout propagation is not the right place to create a
+ second tdesc with the correct block count and the related ops. Hence,
+ subsequent optimization passes may decide whether and how to handle the block count.
}];
let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
"vector::VectorDialect"];
>From 09f9755e6d18daff44233182523bfcaeca4f90b1 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Fri, 21 Nov 2025 10:38:35 +0000
Subject: [PATCH 3/3] Add test and description
---
.../mlir/Dialect/XeGPU/Transforms/Passes.td | 23 +++++++++---------
.../XeGPU/propagate-layout-inst-data.mlir | 24 +++++++++++++++++++
2 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 1fb01c3586e93..0ca58426ecfcb 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -38,17 +38,18 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
this propagated layout information, pass will then update op result type
with the layout information.
- `inst`:
- Sets the `inst_data` field of the layout attribute.
- The goal is to select the highest granularity of the
- instruction shape to minimize the number of instructions required
- for processing the user shape.
- For nd operations, the granularity depends on (W)idth, (H)eight and (B)lock count.
- The maximum allowed block count of one operation may differ from that of
- another operation, even though both operations
- may use the same tensor descriptor. Layout propagation is not the right place to create a
- second tdesc with the correct block count and the related ops. Hence,
- subsequent optimization passes may decide whether and how to handle the block count.
+ `layout-kind` option values:
+ - `inst`
+ Propagate the `inst_data` field of the layout attribute. The default is chosen to
+ maximize instruction-level granularity so that the user shape can be processed
+ with the fewest instructions. For N-D operations, this granularity depends on
+ W (width) and H (height) of the instruction shape.
+ The B (block) dimension (or array length) is not included in the default
+ configuration and must be enabled via a separate optimization pass.
+
+ - `lane`
+ Propagate the `lane_layout` and `lane_data` fields of the layout attribute.
+ Default values are selected to match the hardware's SIMT lane configuration.
}];
let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
"vector::VectorDialect"];
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index c31ef323a94d2..0c837e17a0afa 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -1,5 +1,29 @@
// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @load_store_no_array_len(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC_SRC]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+gpu.module @test {
+// Although the uArch allows an 8x32 inst_data via the block count (or array_len),
+// it is up to later optimization passes to decide whether to use the block count.
+func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+ %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+ %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+ xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+ return
+}
+}
+
+// -----
+
// CHECK-LABEL: func.func @dpas_f16(
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>