[Mlir-commits] [mlir] [mlir][xegpu] Add support for `vector.reduction` and `vector.multi_reduction` subgroup to work-item distribution. (PR #180308)
Artem Kroviakov
llvmlistbot at llvm.org
Tue Feb 10 01:46:07 PST 2026
================
@@ -362,6 +395,137 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
}
};
+/// This pattern distributes a subgroup-level vector.reduction op to
+/// workitem-level. This requires shuffling the data across the workitems
+/// (using gpu::ShuffleOp) and reducing in stages until all workitems have the
+/// final result.
+///
+/// The pattern applies only when all of the following hold:
+///   - the source vector has a distribute layout for which isForSubgroup()
+///     is true,
+///   - the source vector is rank 1 and the layout has the same rank,
+///   - the vector length is a multiple of the lane count along dimension 0,
+///   - the reduction result type is an integer or a float.
+/// If the op has an accumulator, it is combined with the reduced value using
+/// the op's combining kind.
+struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
+  using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ReductionOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto layout = xegpu::getDistributeLayoutAttr(op.getVector());
+
+    // If no layout, nothing to do.
+    if (!layout || !layout.isForSubgroup())
+      return failure();
+
+    VectorType vectorType = op.getSourceVectorType();
+
+    // Only rank 1 vectors supported.
+    if (vectorType.getRank() != 1)
+      return rewriter.notifyMatchFailure(
+          op, "Only rank 1 reductions can be distributed.");
+    // Lane layout must have the same rank as the vector.
+    if (layout.getRank() != vectorType.getRank())
+      return rewriter.notifyMatchFailure(
+          op, "Layout rank does not match vector rank.");
+
+    // Get the subgroup size from the layout. Indexing with 0 is safe because
+    // the layout rank was checked to be 1 above.
+    int64_t sgSize = layout.getEffectiveLaneLayoutAsInt()[0];
+
+    // The vector length must be evenly divisible by the subgroup size. Any
+    // multiple is accepted, not only an exact match.
+    if (vectorType.getShape()[0] % sgSize != 0)
+      return rewriter.notifyMatchFailure(
+          op, "Reduction vector dimension must be a multiple of subgroup "
+              "size.");
+
+    if (!op.getType().isIntOrFloat())
+      return rewriter.notifyMatchFailure(
+          op, "Reduction distribution currently only supports floats and "
+              "integer types.");
+
+    // Get the distributed vector (per work-item portion).
+    Value laneValVec = adaptor.getVector();
+
+    // Distribute and reduce across work-items in the subgroup.
+    Value fullReduce = xegpu::subgroupReduction(
+        op.getLoc(), rewriter, laneValVec, op.getKind(), sgSize);
+
+    // If there's an accumulator, combine it with the reduced value.
+    if (adaptor.getAcc())
+      fullReduce = vector::makeArithReduction(
+          rewriter, op.getLoc(), op.getKind(), fullReduce, adaptor.getAcc());
+
+    rewriter.replaceOp(op, fullReduce);
+    return success();
+  }
+};
+
+/// This pattern distributes a subgroup-level vector.multi_reduction op to
+/// workitem-level only if the reduction is lane-local. This means that
+/// reduction dimension is not distributed to lanes and each lane does its own
+/// local reduction.
+///
+/// The rewrite re-creates the op with the already-converted (adaptor)
+/// operands and a result type derived from the result's temporary layout.
+/// The pattern reports a match failure, without touching the IR, when the op
+/// is not a valid subgroup multi-reduction, when the reduction is not
+/// lane-local, when the result is not a vector, or when the distributed
+/// result type cannot be computed.
+struct SgToWiMultiDimReduction
+    : public OpConversionPattern<vector::MultiDimReductionOp> {
+  using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::MultiDimReductionOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Check if the reduction op is valid for distribution.
+    if (!isValidSubgroupMultiReductionOp(op))
+      return rewriter.notifyMatchFailure(
+          op,
+          "Not a valid subgroup multi reduction op that can be distributed.");
+    // Only lane-local reduction is handled here.
+    if (!isReductionLaneLocal(op))
+      return rewriter.notifyMatchFailure(
+          op, "Only lane-local reduction is supported, expected reduction "
+              "dimension to be "
+              "not distributed.");
+    auto resLayout = xegpu::getTemporaryLayout(op->getOpResult(0));
+    // dyn_cast yields a null type for a non-vector result; bail out instead
+    // of handing a null type to the distribution helper.
+    VectorType resVecTy = dyn_cast<VectorType>(op.getType());
+    if (!resVecTy)
+      return rewriter.notifyMatchFailure(
+          op, "Expected the reduction result to be a vector.");
+    auto resDistVecTyOrFailure =
+        getDistVecTypeBasedOnLaneLayout(resLayout, resVecTy);
+    // Calling value() on a failed result would abort; turn it into a regular
+    // match failure before any IR is modified.
+    if (failed(resDistVecTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "Failed to compute the distributed result vector type.");
+    // Simply create a new MultiDimReductionOp using adaptor operands and the
+    // new result type.
+    auto newOp = vector::MultiDimReductionOp::create(
+        rewriter, op.getLoc(), resDistVecTyOrFailure.value(), op.getKind(),
+        adaptor.getSource(), adaptor.getAcc(), op.getReductionDims());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
+/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
+/// of vector.extract_strided_slice, vector.reduction and
+/// vector.insert_strided_slice ops. This is used when the reduction dimension
+/// is distributed to lanes and a naive (lane-local) distribution is not
+/// possible. Then later on, these partilly lowered subgroup-level ops are
----------------
akroviakov wrote:
```suggestion
/// possible. Then later on, these partially lowered subgroup-level ops are
```
https://github.com/llvm/llvm-project/pull/180308
More information about the Mlir-commits
mailing list