[Mlir-commits] [mlir] 08895d2 - [MLIR][XeGPU] Distribute `vector.step` with sliced layout (#182010)

Fri Feb 20 03:02:33 PST 2026

Author: Artem Kroviakov
Date: 2026-02-20T11:02:29Z
New Revision: 08895d2450af6ef0c312547e6b764c20788c732c

URL: https://github.com/llvm/llvm-project/commit/08895d2450af6ef0c312547e6b764c20788c732c
DIFF: https://github.com/llvm/llvm-project/commit/08895d2450af6ef0c312547e6b764c20788c732c.diff

LOG: [MLIR][XeGPU] Distribute `vector.step` with sliced layout (#182010)

Added: 
    

Modified: 
    mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
    mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ace00a746e21..8561b139dcedb 100644

--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -699,8 +699,6 @@ FailureOr<SmallVector<SmallVector<Value>>>
 SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
                                     Value linearId, ArrayRef<int64_t> shape) {
   assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
-  if (!isForWorkgroup())
-    return failure();
 
   SmallVector<int64_t> layout;
   SmallVector<int64_t> subShape;

diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 99c2da386fab6..f05036deabe41 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
@@ -1988,6 +1989,77 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+/// Distribute a vector::StepOp yielded by the warp op whose result carries a
+/// slice layout with exactly one effective lane dimension. The step op is
+/// resolved outside the warp op: each lane's lane_data-sized subranges are
+/// computed from the lane id and assembled with vector::FromElementsOp.
+struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          warpOp, "warp result is not a vector::StepOp op");
+    auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(stepOp->getResult(0));
+    if (!resultLayout)
+      return rewriter.notifyMatchFailure(
+          stepOp, "the result vector of the step op lacks layout "
+                  "attribute");
+    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
+    if (!sliceLayout)
+      return rewriter.notifyMatchFailure(
+          stepOp, "the result layout must be a slice layout");
+    if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
+      return rewriter.notifyMatchFailure(
+          stepOp, "expecting 1 dim in the effective result layout");
+
+    rewriter.setInsertionPointAfter(warpOp);
+    auto loc = stepOp.getLoc();
+    auto stepResultVecTy = stepOp.getResult().getType();
+    Value distributedVal = warpOp.getResult(operandIdx);
+    VectorType newVecTy = cast<VectorType>(distributedVal.getType());
+
+    auto laneDataBlockCoords = resultLayout.computeDistributedCoords(
+        rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
+    if (failed(laneDataBlockCoords))
+      return rewriter.notifyMatchFailure(
+          stepOp, "failed to compute lane data block coordinates");
+
+    auto laneDataBlockCoordsVec = laneDataBlockCoords.value();
+    auto laneDataBlockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
+    assert(static_cast<int64_t>(laneDataBlockCoordsVec.size()) ==
+           newVecTy.getNumElements() / laneDataBlockLength);
+    SmallVector<Value> stepVals;
+    // For each lane_data block owned by this lane, rebuild its sub-range of
+    // the subgroup-level vector.step from the block's start coordinate.
+    // Example: vector.step {slice<layout<lane_layout=[2,4,2],
+    // lane_data=[1,2,1]>, dims=[0,2]>} : vector<16xindex>.
+    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
+    // The blocks are distributed round-robin, so logical lane id 0 holds
+    // the values [0, 1, 8, 9].
+    for (auto &laneDataBlockCoords : laneDataBlockCoordsVec) {
+      auto laneDataBlockStartCoord = laneDataBlockCoords[0];
+      stepVals.push_back(laneDataBlockStartCoord);
+      for (int i = 1; i < laneDataBlockLength; ++i) {
+        auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
+        stepVals.push_back(arith::AddIOp::create(
+            rewriter, loc, laneDataBlockStartCoord, offset));
+      }
+    }
+    assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
+           "Expecting the number of step values to match the number of "
+           "elements in the vector");
+    auto stepOpVal =
+        vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
+    rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
+    return success();
+  }
+};
+
 } // namespace
 
 namespace {
@@ -2014,8 +2086,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
   patterns
       .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
            VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
-           SinkUniformOps>(patterns.getContext(),
-                           /*pattern benefit=*/PatternHierarchy::AboveRegular);
+           VectorStepSliceDistribution, SinkUniformOps>(
+          patterns.getContext(),
+          /*pattern benefit=*/PatternHierarchy::AboveRegular);
 }
 
 void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index fb23f38b44b46..31bb6704eece9 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1132,4 +1132,61 @@ gpu.func
     gpu.return
   }
 
+// CHECK-LABEL: gpu.func @vector_step_slice
+// CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+// CHECK:         %[[LANE_ID_IN_SLICED_DIM:.*]] = arith.remui %[[LANE_ID]], %c16 : index
+// CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM1:.*]] = arith.remui %[[LANE_ID_IN_SLICED_DIM]], %c16 : index
+// CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = vector.broadcast %[[LANE_ID_IN_SLICED_DIM1]] : index to vector<1xindex>
+// CHECK-NEXT:    "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
+  gpu.func @vector_step_slice(%arg0: index) {
+    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
+      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
+      gpu.yield %5 : vector<16xindex>
+    }
+    "some_use"(%0) : (vector<1xindex>) -> ()
+    gpu.return
+  }
+
+  // CHECK-LABEL: gpu.func @vector_step_slice_unit
+  // CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+  // CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+  // CHECK-NEXT:    "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
+  gpu.func @vector_step_slice_unit(%arg0: index) {
+    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
+      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
+      gpu.yield %5 : vector<1xindex>
+    }
+    "some_use"(%0) : (vector<1xindex>) -> ()
+    gpu.return
+  }
+
+  // CHECK-LABEL: gpu.func @vector_step_slice_multi_dist_unit
+  // CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+  // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+  // CHECK-DAG:    %[[DIST_UNIT_SIZE:.*]] = arith.constant 8 : index
+  // CHECK-DAG:    %[[SG_LEVEL_VECSIZE:.*]] = arith.constant 16 : index
+  // CHECK-DAG:    %[[LANE_LAYOUT:.*]] = arith.constant 4 : index
+  // CHECK-DAG:    %[[LANE_DATA:.*]] = arith.constant 2 : index
+  // CHECK-DAG:    %[[LANE_DIST_UNIT_START_IDX:.*]] = arith.divui %[[LANE_ID]], %[[LANE_DATA]] : index
+  // CHECK-DAG:    %[[DIST_UNIT_0_IDX:.*]] = arith.remui %[[LANE_DIST_UNIT_START_IDX]], %[[LANE_LAYOUT]] : index
+  // CHECK-DAG:    %[[DIST_UNIT_0_OFFSET:.*]] = arith.muli %[[DIST_UNIT_0_IDX]], %[[LANE_DATA]] : index
+  // CHECK-DAG:    %[[DIST_UNIT_0_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_0_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
+  // CHECK-DAG:    %[[DIST_UNIT_1_OFFSET:.*]] = arith.addi %[[DIST_UNIT_0_OFFSET]], %[[DIST_UNIT_SIZE]] : index
+  // CHECK-DAG:    %[[DIST_UNIT_1_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_1_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
+  // CHECK-DAG:    %[[V6:.*]] = arith.addi %[[DIST_UNIT_0_SUBRANGE_START]], %[[C1]] : index
+  // CHECK-DAG:    %[[V7:.*]] = arith.addi %[[DIST_UNIT_1_SUBRANGE_START]], %[[C1]] : index
+  // CHECK-DAG:    %[[VEC:.*]] = vector.from_elements
+  // CHECK-SAME:     %[[DIST_UNIT_0_SUBRANGE_START]], %[[V6]],
+  // CHECK-SAME:     %[[DIST_UNIT_1_SUBRANGE_START]], %[[V7]]
+  // CHECK-SAME:     : vector<4xindex>
+  // CHECK-NEXT:    "some_use"(%[[VEC]]) : (vector<4xindex>) -> ()
+  gpu.func @vector_step_slice_multi_dist_unit(%arg0: index) {
+    %0 = gpu.warp_execute_on_lane_0(%arg0)[4] -> (vector<4xindex>) {
+      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1,2,1]>, dims = [0, 2]>} : vector<16xindex>
+      gpu.yield %5 : vector<16xindex>
+    }
+    "some_use"(%0) : (vector<4xindex>) -> ()
+    gpu.return
+  }
+
 }