[Mlir-commits] [mlir] 08895d2 - [MLIR][XeGPU] Distribute `vector.step` with sliced layout (#182010)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Feb 20 03:02:33 PST 2026
Author: Artem Kroviakov
Date: 2026-02-20T11:02:29Z
New Revision: 08895d2450af6ef0c312547e6b764c20788c732c
URL: https://github.com/llvm/llvm-project/commit/08895d2450af6ef0c312547e6b764c20788c732c
DIFF: https://github.com/llvm/llvm-project/commit/08895d2450af6ef0c312547e6b764c20788c732c.diff
LOG: [MLIR][XeGPU] Distribute `vector.step` with sliced layout (#182010)
Added:
Modified:
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ace00a746e21..8561b139dcedb 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -699,8 +699,6 @@ FailureOr<SmallVector<SmallVector<Value>>>
SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
Value linearId, ArrayRef<int64_t> shape) {
assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
- if (!isForWorkgroup())
- return failure();
SmallVector<int64_t> layout;
SmallVector<int64_t> subShape;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 99c2da386fab6..f05036deabe41 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
@@ -1988,6 +1989,77 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
}
};
+/// Distribute a `vector.step` op whose result carries a sliced layout.
+///
+/// The pattern only applies when the result layout is an `xegpu::SliceAttr`
+/// with exactly one effective lane dimension. The step op is resolved
+/// completely: for every lane_data-sized block assigned to the current lane,
+/// the block's start coordinate is obtained from the layout and the remaining
+/// elements of the block are materialized as `start + 1`, `start + 2`, ...
+/// The per-lane values are then packed with `vector.from_elements` and replace
+/// the corresponding warp op result.
+struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          warpOp, "warp result is not a vector::StepOp op");
+    auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(stepOp->getResult(0));
+    if (!resultLayout)
+      return rewriter.notifyMatchFailure(
+          stepOp, "the result vector of the step op lacks layout "
+                  "attribute");
+    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
+    if (!sliceLayout)
+      return rewriter.notifyMatchFailure(
+          stepOp, "the result layout must be a slice layout");
+    if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
+      return rewriter.notifyMatchFailure(
+          stepOp, "expecting 1 dim in the effective result layout");
+
+    // All replacement ops are created right after the warp op.
+    rewriter.setInsertionPointAfter(warpOp);
+    auto loc = stepOp.getLoc();
+    auto stepResultVecTy = stepOp.getResult().getType();
+    Value distributedVal = warpOp.getResult(operandIdx);
+    VectorType newVecTy = cast<VectorType>(distributedVal.getType());
+
+    auto distributedCoords = resultLayout.computeDistributedCoords(
+        rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
+    if (failed(distributedCoords))
+      return rewriter.notifyMatchFailure(
+          stepOp, "failed to compute lane data block coordinates");
+
+    // Bind by reference: the coordinate list is only read, so there is no
+    // need to copy the nested vectors.
+    const auto &blockCoordsList = *distributedCoords;
+    const int64_t laneDataBlockLength =
+        resultLayout.getEffectiveLaneDataAsInt()[0];
+    assert(static_cast<int64_t>(blockCoordsList.size()) ==
+               newVecTy.getNumElements() / laneDataBlockLength &&
+           "Expecting one coordinate entry per lane_data block of the "
+           "distributed vector");
+    SmallVector<Value> stepVals;
+    stepVals.reserve(newVecTy.getNumElements());
+    // For each lane_data block, reconstruct its sub-range
+    // from the range of SG-level vector.step. Example: vector.step
+    // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
+    // vector<16xindex>
+    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
+    // The blocks are round-robin distributed, so logical lane id 0
+    // holds values [0,1, 8,9].
+    for (const auto &blockCoords : blockCoordsList) {
+      // Only one effective dimension exists, so the block start is entry 0.
+      Value blockStart = blockCoords[0];
+      stepVals.push_back(blockStart);
+      for (int64_t i = 1; i < laneDataBlockLength; ++i) {
+        auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
+        stepVals.push_back(
+            arith::AddIOp::create(rewriter, loc, blockStart, offset));
+      }
+    }
+    assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
+           "Expecting the number of step values to match the number of "
+           "elements in the vector");
+    auto stepOpVal =
+        vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
+    rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
+    return success();
+  }
+};
+
} // namespace
namespace {
@@ -2014,8 +2086,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
patterns
.add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
- SinkUniformOps>(patterns.getContext(),
- /*pattern benefit=*/PatternHierarchy::AboveRegular);
+ VectorStepSliceDistribution, SinkUniformOps>(
+ patterns.getContext(),
+ /*pattern benefit=*/PatternHierarchy::AboveRegular);
}
void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index fb23f38b44b46..31bb6704eece9 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1132,4 +1132,61 @@ gpu.func
gpu.return
}
+// Step op whose slice layout drops dims 0, 1 and 2 of a [1, 1, 1, 16] lane
+// layout. A 16-element step yielded from a 16-lane warp op becomes
+// vector<1xindex> per lane; the expected output derives that element from the
+// lane id (two arith.remui by %c16 followed by a vector.broadcast).
+// CHECK-LABEL: gpu.func @vector_step_slice
+// CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[LANE_ID_IN_SLICED_DIM:.*]] = arith.remui %[[LANE_ID]], %c16 : index
+// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM1:.*]] = arith.remui %[[LANE_ID_IN_SLICED_DIM]], %c16 : index
+// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = vector.broadcast %[[LANE_ID_IN_SLICED_DIM1]] : index to vector<1xindex>
+// CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
+ gpu.func @vector_step_slice(%arg0: index) {
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
+ %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
+ gpu.yield %5 : vector<16xindex>
+ }
+ "some_use"(%0) : (vector<1xindex>) -> ()
+ gpu.return
+ }
+
+ // Step op whose slice layout drops dims 0, 1 and 3, keeping only dim 2,
+ // whose lane_layout entry is 1. The single-element step is expected to be
+ // replaced by a constant zero vector<1xindex>.
+ // CHECK-LABEL: gpu.func @vector_step_slice_unit
+ // CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+ // CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
+ // CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
+ gpu.func @vector_step_slice_unit(%arg0: index) {
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
+ %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
+ gpu.yield %5 : vector<1xindex>
+ }
+ "some_use"(%0) : (vector<1xindex>) -> ()
+ gpu.return
+ }
+
+ // Step op with lane_layout [2, 4, 2] and lane_data [1, 2, 1], sliced over
+ // dims 0 and 2. A 16-element step yielded from a 4-lane warp op becomes
+ // vector<4xindex> per lane. The expected output assembles it with
+ // vector.from_elements from two sub-range start values, each followed by
+ // its start + 1 successor.
+ // CHECK-LABEL: gpu.func @vector_step_slice_multi_dist_unit
+ // CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
+ // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK-DAG: %[[DIST_UNIT_SIZE:.*]] = arith.constant 8 : index
+ // CHECK-DAG: %[[SG_LEVEL_VECSIZE:.*]] = arith.constant 16 : index
+ // CHECK-DAG: %[[LANE_LAYOUT:.*]] = arith.constant 4 : index
+ // CHECK-DAG: %[[LANE_DATA:.*]] = arith.constant 2 : index
+ // CHECK-DAG: %[[LANE_DIST_UNIT_START_IDX:.*]] = arith.divui %[[LANE_ID]], %[[LANE_DATA]] : index
+ // CHECK-DAG: %[[DIST_UNIT_0_IDX:.*]] = arith.remui %[[LANE_DIST_UNIT_START_IDX]], %[[LANE_LAYOUT]] : index
+ // CHECK-DAG: %[[DIST_UNIT_0_OFFSET:.*]] = arith.muli %[[DIST_UNIT_0_IDX]], %[[LANE_DATA]] : index
+ // CHECK-DAG: %[[DIST_UNIT_0_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_0_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
+ // CHECK-DAG: %[[DIST_UNIT_1_OFFSET:.*]] = arith.addi %[[DIST_UNIT_0_OFFSET]], %[[DIST_UNIT_SIZE]] : index
+ // CHECK-DAG: %[[DIST_UNIT_1_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_1_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
+ // CHECK-DAG: %[[V6:.*]] = arith.addi %[[DIST_UNIT_0_SUBRANGE_START]], %[[C1]] : index
+ // CHECK-DAG: %[[V7:.*]] = arith.addi %[[DIST_UNIT_1_SUBRANGE_START]], %[[C1]] : index
+ // CHECK-DAG: %[[VEC:.*]] = vector.from_elements
+ // CHECK-SAME: %[[DIST_UNIT_0_SUBRANGE_START]], %[[V6]],
+ // CHECK-SAME: %[[DIST_UNIT_1_SUBRANGE_START]], %[[V7]]
+ // CHECK-SAME: : vector<4xindex>
+ // CHECK-NEXT: "some_use"(%[[VEC]]) : (vector<4xindex>) -> ()
+ gpu.func @vector_step_slice_multi_dist_unit(%arg0: index) {
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[4] -> (vector<4xindex>) {
+ %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1,2,1]>, dims = [0, 2]>} : vector<16xindex>
+ gpu.yield %5 : vector<16xindex>
+ }
+ "some_use"(%0) : (vector<4xindex>) -> ()
+ gpu.return
+ }
+
}
More information about the Mlir-commits
mailing list