[Mlir-commits] [mlir] [MLIR][XeGPU] Add uniform values distribution pattern (PR #176737)

Mon Jan 26 03:26:52 PST 2026

https://github.com/akroviakov updated https://github.com/llvm/llvm-project/pull/176737

>From 9ab4768e074b5da16c17f681ea9beec9c9577356 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Sun, 18 Jan 2026 16:25:43 +0000
Subject: [PATCH 1/3] [MLIR][XeGPU] Add uniform values "distribution" pattern

---
 .../Vector/Transforms/VectorDistribute.cpp    |  15 ++-
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 101 ++++++++++++++++--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |   4 +-
 .../XeGPU/subgroup-distribute-unit.mlir       |  57 ++++++----
 4 files changed, 142 insertions(+), 35 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 5334470e2e3a0..743fb51bab1ab 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -398,7 +398,8 @@ getInnerRegionEscapingValues(WarpExecuteOnLane0Op warpOp, Region &innerRegion,
       Type distType = operand->get().getType();
       if (auto vecType = dyn_cast<VectorType>(distType)) {
         AffineMap map = distributionMapFn(operand->get());
-        distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+        distType = getDistributedType(vecType, map,
+                                      map.isEmpty() ? 1 : warpOp.getWarpSize());
       }
       escapingValueTypes.push_back(operand->get().getType());
       escapingValueDistTypes.push_back(distType);
@@ -1886,7 +1887,9 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
         // Fallback to affine map if the dist result was not previously recorded
         distType = ifResultDistTypes.count(i)
                        ? ifResultDistTypes[i]
-                       : getDistributedType(vecType, map, warpOp.getWarpSize());
+                       : getDistributedType(
+                             vecType, map,
+                             map.isEmpty() ? 1 : newWarpOp.getWarpSize());
       }
       newIfOpDistResTypes.push_back(distType);
     }
@@ -2075,9 +2078,11 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
         // we can get the distributed type from `forResultDistTypes` map.
         // Otherwise, we compute it using distributionMapFn.
         AffineMap map = distributionMapFn(initArg);
-        distType = forResultDistTypes.count(i)
-                       ? forResultDistTypes[i]
-                       : getDistributedType(vecType, map, warpOp.getWarpSize());
+        distType =
+            forResultDistTypes.count(i)
+                ? forResultDistTypes[i]
+                : getDistributedType(vecType, map,
+                                     map.isEmpty() ? 1 : warpOp.getWarpSize());
       }
       newWarpOpDistTypes.push_back(distType);
     }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9113f00ac39f0..30e104271d2b7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -64,6 +64,14 @@ namespace {
 /// priorities to patterns.
 static constexpr unsigned regularPatternBenefit = 1;
 static constexpr unsigned highPatternBenefit = 2;
+static constexpr unsigned highestPatternBenefit = 3;
+
+enum PatternHierarchy : unsigned {
+  Regular = 1,
+  AboveRegular = 2,
+  High = 3,
+  Highest = 4
+};
 
 /// Helper function to get  distributed vector type for a source vector type
 /// according to the lane_layout. We simply divide each dimension of tensor
@@ -792,9 +800,10 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     // Collect offsets.
     for (size_t i = 1; i < newRetIndices.size(); ++i)
       newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
-    xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
-                                newPrefetchOperands, prefetchOp->getAttrs());
-    xegpu::removeLayoutAttrs(prefetchOp);
+    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
+        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
+        prefetchOp->getAttrs());
+    xegpu::removeLayoutAttrs(newPrefetchOp);
     rewriter.eraseOp(prefetchOp);
     return success();
   }
@@ -1208,6 +1217,77 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+// Sink SG-uniform ops. An op is uniform if any of the following is true:
+// 1. It has no vector operands.
+// 2. All of its vector operands and results are uniform.
+// Non-uniform vectors are handled by dedicated patterns.
+// This pattern must have a higher priority than distribution patterns,
+// because a distributable shape may be logically intended as uniform.
+struct SinkUniformOps final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    // Take the last op
+    Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
+    // Any ops with nested regions must be handled carefully in dedicated
+    // patterns.
+    if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
+      return failure();
+
+    int operandIdx = -1;
+    if (warpRegionPreYieldOp->getNumResults()) {
+      OpOperand *operand = getWarpResult(
+          warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
+      if (!operand)
+        return failure();
+      operandIdx = operand->getOperandNumber();
+      if (warpRegionPreYieldOp->getResult(0).getType() !=
+          warpOp.getResult(operandIdx).getType())
+        return rewriter.notifyMatchFailure(warpOp,
+                                           "The op result is not uniform.");
+    }
+
+    // The op must have at least no layout-based operands or results.
+    bool uniformValuesOnly =
+        llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
+          return !xegpu::getDistributeLayoutAttr(v);
+        });
+    uniformValuesOnly &=
+        llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
+          return !xegpu::getDistributeLayoutAttr(opr);
+        });
+    if (!uniformValuesOnly)
+      return rewriter.notifyMatchFailure(warpOp,
+                                         "Some values are not uniform.");
+
+    // Capture its operands and create a new warp op that yields them.
+    SmallVector<size_t> newRetIndices;
+    SmallVector<Value> operands =
+        llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
+    SmallVector<Type> operandTypes =
+        llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, operands, operandTypes, newRetIndices);
+
+    // Clone the op after the new warp op.
+    rewriter.setInsertionPointAfter(newWarpOp);
+
+    IRMapping operandMapper;
+    for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
+      operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
+                        newWarpOp->getResult(newOperandIdx));
+    Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
+    if (!clonedOp->getNumResults())
+      rewriter.eraseOp(warpRegionPreYieldOp);
+    else {
+      assert(operandIdx != -1 && "Expected a warp result for the operation");
+      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
+                                  clonedOp->getResult(0));
+    }
+    return success();
+  }
+};
+
 /// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
 /// VectorReductionOps. We also insert layouts for the newly created ops.
 static Value lowerToVectorReductions(TypedValue<VectorType> src,
@@ -2036,14 +2116,16 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
                StoreMatrixDistribution,
                MemrefExtractAlignedPointerAsIndexDistribution>(
       patterns.getContext(),
-      /*pattern benefit=*/regularPatternBenefit);
+      /*pattern benefit=*/PatternHierarchy::Regular);
   // For following patterns, we need to override the regular vector distribution
   // patterns. Therefore, assign higher benefit.
   patterns
       .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
            VectorInsertStridedSliceDistribution, VectorBroadcastDistribution>(
           patterns.getContext(),
-          /*pattern benefit=*/highPatternBenefit);
+          /*pattern benefit=*/PatternHierarchy::AboveRegular);
+  patterns.add<SinkUniformOps>(patterns.getContext(),
+                               /*pattern benefit=*/PatternHierarchy::High);
 }
 
 void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
@@ -2094,10 +2176,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
       return AffineMap::get(val.getContext());
     // Get the layout of the vector type.
     xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
-    // If no layout is specified, that means no distribution.
+    // If no layout is specified, assume uniform case (no distribution).
     if (!layout)
-      return AffineMap::getMultiDimMapWithTargets(vecRank, {},
-                                                  val.getContext());
+      return AffineMap::get(val.getContext());
     // Expecting vector and layout rank to match.
     assert(layout.getRank() == vecRank &&
            "Expecting vector and layout rank to match");
@@ -2133,11 +2214,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
 
   vector::populateDistributeReduction(
       patterns, warpReduction,
-      /*pattern benefit=*/regularPatternBenefit);
+      /*pattern benefit=*/PatternHierarchy::Regular);
 
   vector::populatePropagateWarpVectorDistributionPatterns(
       patterns, distributionFn, shuffleFn,
-      /*pattern benefit=*/regularPatternBenefit);
+      /*pattern benefit=*/PatternHierarchy::Regular);
   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
     signalPassFailure();
     return;
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 51783b41c4c96..e9d81dd4d1d81 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -361,9 +361,9 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
         continue;
       auto layout = xegpu::getDistributeLayoutAttr(operand.get());
       if (!layout) {
-        op->emitError("Could not find layout attribute for operand ")
+        op->emitWarning("Could not find layout attribute for operand ")
             << operand.getOperandNumber() << " of operation " << op->getName();
-        return WalkResult::interrupt();
+        continue;
       }
       xegpu::setDistributeLayoutAttr(operand, layout);
     }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index b136c89925682..a99f850de6175 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -16,7 +16,7 @@ gpu.func @store_nd_1d(%laneid: index) {
   gpu.warp_execute_on_lane_0(%laneid)[16] {
     %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
     %cst = "some_op"() : () -> vector<16xf32>
-    xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    xegpu.store_nd %cst, %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
       : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   }
   gpu.return
@@ -39,7 +39,7 @@ gpu.func @store_nd_2d(%laneid : index) {
   gpu.warp_execute_on_lane_0(%laneid)[16] {
     %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     %cst = "some_op"() : () -> vector<16x16xf16>
-    xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    xegpu.store_nd %cst, %0 [%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   }
   gpu.return
@@ -60,7 +60,7 @@ gpu.func @load_nd_1d(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
     %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    %1 = xegpu.load_nd %0 [%c0]  {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+    %1 = xegpu.load_nd %0 [%c0]  {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
       !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
     gpu.yield %1 : vector<16xf32>
   }
@@ -84,7 +84,7 @@ gpu.func @load_nd_2d(%laneid: index) {
   %c0 = arith.constant 0 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
     %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    %1 = xegpu.load_nd %0[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
     gpu.yield %1 : vector<16x16xf16>
   }
@@ -112,7 +112,7 @@ gpu.func @load_nd_array_length(%laneid: index) {
   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) {
     %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
       #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    %1 = xegpu.load_nd %0[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
       : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
         #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
     gpu.yield %1 : vector<2x16x16xf16>
@@ -192,7 +192,7 @@ gpu.func @prefetch_2d(%laneid: index) {
     %0 = "some_op"() : ()
       -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     xegpu.prefetch_nd %0[%c0, %c0]
-      <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
       : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   }
   gpu.return
@@ -215,7 +215,7 @@ gpu.func @prefetch_1d(%laneid: index) {
     %0 = "some_op"() : ()
       -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
     xegpu.prefetch_nd %0[%c0]
-      <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+      {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
       : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
   }
   gpu.return
@@ -285,14 +285,15 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
 
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK:      %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK:      %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
 // CHECK-NEXT:   %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
 // CHECK-NEXT:   %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
 // CHECK-NEXT:   %[[T3:.*]] = vector.reduction <add>, %[[T2]], %cst : vector<16xf32> into f32
-// CHECK-NEXT:   %[[T4:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:   %[[T5:.*]] = vector.reduction <add>, %[[T4]], %cst : vector<16xf32> into f32
-// CHECK-NEXT:   %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32>
-// CHECK-NEXT:   gpu.yield %[[T6]] : vector<2xf32>
+// CHECK-NEXT:   %[[T4:.*]] = vector.insert %[[T3]], %cst_0 [0] : f32 into vector<2xf32>
+// CHECK-NEXT:   %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT:   %[[T6:.*]] = vector.reduction <add>, %[[T5]], %cst : vector<16xf32> into f32
+// CHECK-NEXT:   %[[T7:.*]] = vector.insert %[[T6]], %[[T4]] [1] : f32 into vector<2xf32>
+// CHECK-NEXT:   gpu.yield %[[T7]]
 // CHECK-NEXT: }
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
@@ -317,6 +318,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
 }
 
 
+
 // CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
 // CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
 // CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
@@ -354,18 +356,19 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
 
 
 // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK:     %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK:     %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>{{.*}}) {
 // CHECK:       %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32>
 // CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
 // CHECK-SAME:    {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
 // CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32>
 // CHECK:       %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK:       %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
+// CHECK:       %[[T4:.*]] = vector.insert %[[T3]], %cst_0 [0] : f32 into vector<2xf32>
+// CHECK:       %[[T5:.*]] = vector.extract_strided_slice %[[SRC]]
 // CHECK-SAME:     {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T5:.*]] = vector.shape_cast %[[T4]] {{.*}} : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
-// CHECK:       %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
-// CHECK:       gpu.yield %[[T7]] : vector<2xf32>
+// CHECK:       %[[T6:.*]] = vector.shape_cast %[[T5]] {{.*}} : vector<16x1xf32> to vector<16xf32>
+// CHECK:       %[[T7:.*]] = vector.reduction <add>, %[[T6]], %{{.*}} : vector<16xf32> into f32
+// CHECK:       %[[T8:.*]] = vector.insert %[[T7]], %[[T4]] [1] : f32 into vector<2xf32>
+// CHECK:       gpu.yield %[[T8]]
 // CHECK:     }
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
   %c0 = arith.constant 0 : index
@@ -1030,4 +1033,22 @@ gpu.func
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector_uniform
+// CHECK-SAME: (%[[ARG0:.*]]: index)
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x16xf16>, f16)
+// CHECK: %[[DEF:.*]] = "some_def"()
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] : f16 to vector<16x16xf16>
+// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
+// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x16xf16>
+// CHECK: "some_use"(%[[RESULT]])
+  gpu.func @vector_shape_cast_scalar_to_vector_uniform(%arg0: index) {
+    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x16xf16>) {
+      %1 = "some_def"() : () -> f16
+      %2 = vector.broadcast %1 : f16 to vector<16x16xf16>
+      gpu.yield %2 : vector<16x16xf16>
+    }
+    "some_use"(%0) : (vector<16x16xf16>) -> ()
+    gpu.return
+  }
+
 }

>From 55e4f7997363db450fa6034aca02bb0b83c617dc Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Mon, 19 Jan 2026 11:27:33 +0000
Subject: [PATCH 2/3] Cleanup

---
 .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp   | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 30e104271d2b7..c56793effb2f7 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -62,10 +62,6 @@ namespace {
 /// In certain cases, we may need to favor XeGPU specific distribution patterns
 /// over generic vector distribution patterns. In such cases, we can assign
 /// priorities to patterns.
-static constexpr unsigned regularPatternBenefit = 1;
-static constexpr unsigned highPatternBenefit = 2;
-static constexpr unsigned highestPatternBenefit = 3;
-
 enum PatternHierarchy : unsigned {
   Regular = 1,
   AboveRegular = 2,
@@ -1217,9 +1213,8 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
-// Sink SG-uniform ops. An op is uniform if any of the following is true:
-// 1. It has no vector operands.
-// 2. All of its vector operands and results are uniform.
+// Sink SG-uniform ops. An op is uniform if any of the following is none
+// of its operands/results has a distribution layout attribute.
 // Non-uniform vectors are handled by dedicated patterns.
 // This pattern must have a higher priority than distribution patterns,
 // because a distributable shape may be logically intended as uniform.
@@ -1233,7 +1228,6 @@ struct SinkUniformOps final : public gpu::WarpDistributionPattern {
     // patterns.
     if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
       return failure();
-
     int operandIdx = -1;
     if (warpRegionPreYieldOp->getNumResults()) {
       OpOperand *operand = getWarpResult(
@@ -1259,8 +1253,6 @@ struct SinkUniformOps final : public gpu::WarpDistributionPattern {
     if (!uniformValuesOnly)
       return rewriter.notifyMatchFailure(warpOp,
                                          "Some values are not uniform.");
-
-    // Capture its operands and create a new warp op that yields them.
     SmallVector<size_t> newRetIndices;
     SmallVector<Value> operands =
         llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
@@ -1269,9 +1261,7 @@ struct SinkUniformOps final : public gpu::WarpDistributionPattern {
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, operands, operandTypes, newRetIndices);
 
-    // Clone the op after the new warp op.
     rewriter.setInsertionPointAfter(newWarpOp);
-
     IRMapping operandMapper;
     for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
       operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),

>From da65a9bc4f112c83b40b7f9ce3cad65ea7e6d3c0 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Mon, 26 Jan 2026 11:21:42 +0000
Subject: [PATCH 3/3] Remove hierarchy level

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 24 +++++++------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index c56793effb2f7..ac8c02827c1d4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -62,12 +62,7 @@ namespace {
 /// In certain cases, we may need to favor XeGPU specific distribution patterns
 /// over generic vector distribution patterns. In such cases, we can assign
 /// priorities to patterns.
-enum PatternHierarchy : unsigned {
-  Regular = 1,
-  AboveRegular = 2,
-  High = 3,
-  Highest = 4
-};
+enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 };
 
 /// Helper function to get  distributed vector type for a source vector type
 /// according to the lane_layout. We simply divide each dimension of tensor
@@ -1213,11 +1208,12 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
-// Sink SG-uniform ops. An op is uniform if any of the following is none
+// Sink SG-uniform ops. An op is uniform if none
 // of its operands/results has a distribution layout attribute.
 // Non-uniform vectors are handled by dedicated patterns.
-// This pattern must have a higher priority than distribution patterns,
-// because a distributable shape may be logically intended as uniform.
+// This pattern must have a higher priority than vector dialect distribution
+// patterns, because a distributable shape may be logically intended as
+// uniform (i.e., no layout), so we want to omit its distribution.
 struct SinkUniformOps final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -1241,7 +1237,7 @@ struct SinkUniformOps final : public gpu::WarpDistributionPattern {
                                            "The op result is not uniform.");
     }
 
-    // The op must have at least no layout-based operands or results.
+    // The op must have no layout-based operands or results.
     bool uniformValuesOnly =
         llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
           return !xegpu::getDistributeLayoutAttr(v);
@@ -2111,11 +2107,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
   // patterns. Therefore, assign higher benefit.
   patterns
       .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
-           VectorInsertStridedSliceDistribution, VectorBroadcastDistribution>(
-          patterns.getContext(),
-          /*pattern benefit=*/PatternHierarchy::AboveRegular);
-  patterns.add<SinkUniformOps>(patterns.getContext(),
-                               /*pattern benefit=*/PatternHierarchy::High);
+           VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
+           SinkUniformOps>(patterns.getContext(),
+                           /*pattern benefit=*/PatternHierarchy::AboveRegular);
 }
 
 void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(