[Mlir-commits] [mlir] fe71ea4 - [MLIR][XeGPU] Preserve Leading dimension when blocking rank-sensitive operations (#177489)

Sat Jan 24 12:34:43 PST 2026

Author: Jianhui Li
Date: 2026-01-24T12:34:38-08:00
New Revision: fe71ea4437a8a6cc6e0b2e54e6d0a1fadd7ed029

URL: https://github.com/llvm/llvm-project/commit/fe71ea4437a8a6cc6e0b2e54e6d0a1fadd7ed029
DIFF: https://github.com/llvm/llvm-project/commit/fe71ea4437a8a6cc6e0b2e54e6d0a1fadd7ed029.diff

LOG: [MLIR][XeGPU] Preserve Leading dimension when blocking rank-sensitive operations (#177489)

This PR preserves leading unit dimensions when blocking rank-sensitive
operations: xegpu.load_matrix/store_matrix/atomic_rmw/convert_layout, and
the vector operations that affect shapes:
broadcast/multi-reduction/shape_cast/transpose.
Rank-sensitive operations are operations whose semantics depend on the
tensor rank (and consequently its shape). Blocking therefore must not alter
the rank or shape of their tiles, for example by dropping leading unit
dimensions.

Added: 
    

Modified: 
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
    mlir/test/Dialect/XeGPU/xegpu-blocking.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 931834ba16d9a..c00b7d42d48a6 100644

--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -137,28 +137,33 @@ template <typename T, typename>
 std::optional<SmallVector<int64_t>>
 XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
   Value value;
-  if constexpr (std::is_same_v<T, OpOperand>)
+  Operation *ownerOp;
+  if constexpr (std::is_same_v<T, OpOperand>) {
     value = operandOrResult.get();
-  else
+    ownerOp = operandOrResult.getOwner();
+  } else {
     value = (Value)operandOrResult;
+    ownerOp = value.getDefiningOp();
+  }
 
   xegpu::DistributeLayoutAttr layout =
       xegpu::getDistributeLayoutAttr(operandOrResult);
   if (layout && layout.isForSubgroup()) {
     if (!layout.getEffectiveInstDataAsInt().empty()) {
       SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
-      // Remove leading unit dimensions from inst_data
-      // For example, if the inst_data is [1, 1, 32]
-      // it will pass [32] as the unroll/blocking size.
-      // Skip it for xegpu nd ops since it will be 2D
-      // TODO: For vectors ops, experiment with the
-      // upstream vector remove leading unit dims patterns,
-      // populateCastAwayVectorLeadingOneDimPatterns.
-      Operation *definingOp = value.getDefiningOp();
+      // Remove leading unit dimensions from inst_data for non-rank-sensitive
+      // ops. For example, if the inst_data is [1, 1, 32] it will pass [32] as
+      // the unroll/blocking size.
+      // Skip it for rank-sensitive ops, whose semantics depend on the tensor
+      // rank (and consequently its shape), and therefore must not alter the
+      // input tile rank or shape, such as by dropping leading dimensions.
       bool skipLeadingUnitDimRemoval =
-          definingOp &&
-          (isa<xegpu::CreateNdDescOp, xegpu::LoadNdOp, xegpu::DpasOp,
-               xegpu::StoreNdOp, xegpu::PrefetchNdOp>(definingOp));
+          ownerOp &&
+          (isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
+               xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
+               xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
+               vector::TransposeOp, vector::ShapeCastOp,
+               vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
       if (!skipLeadingUnitDimRemoval) {
         auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
         instData.erase(instData.begin(), it);

diff  --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index f81865350ce6a..0b6e30e6f95f0 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -630,10 +630,10 @@ gpu.module @test_kernel {
 // -----
 gpu.module @test_kernel {
   // CHECK-LABEL: unroll_store_matrix
-  gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
+  gpu.func @unroll_store_matrix(%value: vector<1x32xf32>, %arg0 : memref<32768xi8, 3>) {
     %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
-    // CHECK-COUNT-8:  xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
-    xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+    // CHECK-COUNT-2:  xegpu.store_matrix {{.*}} : vector<1x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+    xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [1, 16]>} : vector<1x32xf32>, !xegpu.mem_desc<64x128xf32>
     gpu.return
   }
 }