[Mlir-commits] [mlir] [MLIR] [XeGPU] Add distribution patterns for vector transpose, bitcast & mask ops in sg to wi pass (PR #187392)
Nishant Patel
llvmlistbot at llvm.org
Mon Mar 23 15:45:53 PDT 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/187392
>From 70490321392a35908560171c98aef136e2edf76f Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 19 Feb 2026 19:30:04 +0000
Subject: [PATCH 1/4] Add distribution pattern for vector.transpose &
vector.bitcast
---
.../XeGPUSgToWiDistributeExperimental.cpp | 86 ++++++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 36 ++++++++
2 files changed, 121 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..c35daeb1937e3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -489,6 +489,79 @@ struct SgToWiMultiDimReduction
}
};
+/// Distributes a subgroup-level vector.transpose op to workitem-level.
+/// Matching fails unless source and result both carry a temporary layout, both
+/// are rank 2, and the result layout is a transpose of the source layout under
+/// the op's permutation. The new transpose is cast to the distributed type.
+struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
+ using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::TransposeOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr sourceLayout =
+ xegpu::getTemporaryLayout(op->getOpOperand(0));
+ xegpu::DistributeLayoutAttr resultLayout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!sourceLayout || !resultLayout)
+ return rewriter.notifyMatchFailure(
+ op, "the source or result vector of the transpose op lacks layout "
+ "attribute");
+ int64_t sourceRank = op.getSourceVectorType().getRank();
+ int64_t resultRank = op.getResultVectorType().getRank();
+ // Reject anything that is not a rank-2 to rank-2 transpose.
+ if (sourceRank != 2 || resultRank != 2)
+ return rewriter.notifyMatchFailure(
+ op, "the source or result vector of the transpose op "
+ "does not have 2D layout");
+ ArrayRef<int64_t> perm = op.getPermutation();
+ // Result layout must be a transpose of the source layout w.r.t. `perm`.
+ if (!resultLayout.isTransposeOf(sourceLayout, perm))
+ return rewriter.notifyMatchFailure(
+ op, "the source or result vector layouts must be 2D transposes of "
+ "each other");
+ FailureOr<VectorType> distributedResultTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resultLayout, op.getResultVectorType());
+ if (failed(distributedResultTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to distribute the result vector type in "
+ "vector::Transpose op");
+ auto newOp = vector::TransposeOp::create(rewriter, op.getLoc(),
+ adaptor.getVector(), perm);
+ rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+ distributedResultTypeOrFailure.value()));
+ return success();
+ }
+};
+
+/// Distributes a subgroup-level vector.bitcast op to workitem-level.
+/// Bitcast only impacts the innermost dimension of the source/result vectors.
+/// Only the result layout is read; the new op casts to its distributed type.
+struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
+ using OpConversionPattern<vector::BitCastOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::BitCastOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr resultLayout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!resultLayout)
+ return rewriter.notifyMatchFailure(
+ op, "result vector of the bitcast op lacks layout attribute");
+ FailureOr<VectorType> distributedResultTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(resultLayout, op.getResultVectorType());
+ if (failed(distributedResultTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to distribute the result vector type in "
+ "vector::BitCast op");
+ auto newOp = vector::BitCastOp::create(
+ rewriter, op.getLoc(), distributedResultTypeOrFailure.value(),
+ adaptor.getSource());
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
/// of vector.extract_strided_slice, vector.reduction and
/// vector.insert_strided_slice ops. This is used when the reduction dimension
@@ -727,10 +800,21 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
// Lane local reductions are illegal at this point and must be lowered.
return !isReductionLaneLocal(op);
});
+ // vector::TransposeOp is legal only if it has no result layout attribute.
+ target.addDynamicallyLegalOp<vector::TransposeOp>(
+ [=](vector::TransposeOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
+ // vector::BitCastOp is legal only if it has no result layout attribute.
+ target.addDynamicallyLegalOp<vector::BitCastOp>(
+ [=](vector::BitCastOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiVectorReduction, SgToWiMultiDimReduction>(
+ SgToWiVectorReduction, SgToWiMultiDimReduction,
+ SgToWiVectorTranspose, SgToWiVectorBitcast>(
typeConverter, patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..73645d2f7b252 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -317,4 +317,40 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
[1] : vector<16x12xf32> to vector<16xf32>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @vector_transpose
+// CHECK: %[[SRC:.*]] = "some_op"()
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
+// CHECK-NEXT: %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+// CHECK-NEXT: gpu.return
+gpu.func @vector_transpose() {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<16x2xf32>)
+ %transpose = vector.transpose %cst, [1, 0]
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x2xf32> to vector<2x16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_bitcast
+// CHECK: %[[SRC:.*]] = "some_op"()
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<4x32xi8> to vector<4x2xi8>
+// CHECK-NEXT: %[[BC:.*]] = vector.bitcast %[[CAST]] : vector<4x2xi8> to vector<4x1xi16>
+// CHECK-NEXT: gpu.return
+gpu.func @vector_bitcast() {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+ : () -> (vector<4x32xi8>)
+ %bitcast = vector.bitcast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<4x32xi8> to vector<4x16xi16>
+ gpu.return
+}
}
>From 44aba8df1c3d061a0fc9e2edc5f3919c632be0da Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 17 Mar 2026 22:11:26 +0000
Subject: [PATCH 2/4] Add patterns for CreateMask and ConstantMask
---
.../XeGPUSgToWiDistributeExperimental.cpp | 96 ++++++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 48 ++++++++++
2 files changed, 142 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 9ca08a3ba53ce..fe5a143b11fa6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
@@ -744,6 +745,85 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
}
};
+/// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
+/// to workitem-level. Each lane computes its own mask bounds from the lane
+/// coordinates obtained by delinearizing gpu.lane_id with the result layout.
+/// For each distributed dimension i (dims whose size is unchanged keep theirs):
+/// new_bound[i] = original_bound[i] - lane_coord[i] * dist_shape[i]
+/// vector.create_mask implicitly clamps to [0, vector_size].
+/// constant_mask sizes become arith index constants; a create_mask is emitted.
+template <typename OpType,
+ typename = std::enable_if_t<llvm::is_one_of<
+ OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
+struct SgToWiCreateMask : public OpConversionPattern<OpType> {
+ using OpConversionPattern<OpType>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!layout || !layout.isForSubgroup())
+ return rewriter.notifyMatchFailure(
+ op, "operation result does not have subgroup distribute layout");
+
+ VectorType origType = op.getType();
+ FailureOr<VectorType> distTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, origType);
+ if (failed(distTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute workitem vector type from the layout");
+
+ VectorType distType = distTypeOrFailure.value();
+ Location loc = op.getLoc();
+
+ // Collect one index Value per mask dimension (constants for constant_mask).
+ SmallVector<Value> origOperands;
+ if constexpr (std::is_same_v<OpType, vector::CreateMaskOp>) {
+ origOperands.append(op.getOperands().begin(), op.getOperands().end());
+ } else {
+ auto dimSizes = op.getMaskDimSizesAttr().asArrayRef();
+ for (auto dimSize : dimSizes)
+ origOperands.push_back(
+ arith::ConstantIndexOp::create(rewriter, loc, dimSize).getResult());
+ }
+
+ ArrayRef<int64_t> origShape = origType.getShape();
+ ArrayRef<int64_t> distShape = distType.getShape();
+
+ // Map the linear lane ID to per-dimension lane coordinates via the layout.
+ Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
+ /*upperBound=*/mlir::IntegerAttr());
+ auto maybeIds = layout.delinearizeId(rewriter, loc, laneId);
+ if (failed(maybeIds))
+ return rewriter.notifyMatchFailure(
+ op, "failed to delinearize lane ID from layout");
+ SmallVector<Value> laneIds = maybeIds.value();
+
+ // Compute per-lane mask bounds; s0 = lane coordinate, s1 = original bound.
+ AffineExpr s0, s1;
+ bindSymbols(rewriter.getContext(), s0, s1);
+ SmallVector<Value> newOperands;
+ for (int i = 0, e = distShape.size(); i < e; ++i) {
+ if (origShape[i] == distShape[i]) {
+ // Same size before and after distribution: reuse the original bound.
+ newOperands.push_back(origOperands[i]);
+ } else {
+ // new_bound = original_bound - lane_coord * distShape[i]
+ Value maskDimIdx = affine::makeComposedAffineApply(
+ rewriter, loc, s1 - s0 * distShape[i],
+ {laneIds[i], origOperands[i]});
+ newOperands.push_back(maskDimIdx);
+ }
+ }
+
+ auto newMask =
+ vector::CreateMaskOp::create(rewriter, loc, distType, newOperands);
+ rewriter.replaceOp(op, newMask.getResult());
+ return success();
+ }
+};
+
/// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
@@ -1120,6 +1200,16 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
[=](vector::MultiDimReductionOp op) -> bool {
return !isValidSubgroupMultiReductionOp(op);
});
+ // vector::CreateMaskOp is legal only if it has no result layout attribute.
+ target.addDynamicallyLegalOp<vector::CreateMaskOp>(
+ [=](vector::CreateMaskOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
+ // vector::ConstantMaskOp is legal only if it has no result layout attribute.
+ target.addDynamicallyLegalOp<vector::ConstantMaskOp>(
+ [=](vector::ConstantMaskOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
// vector::TransposeOp is legal only if it has no result layout attribute.
target.addDynamicallyLegalOp<vector::TransposeOp>(
[=](vector::TransposeOp op) -> bool {
@@ -1135,6 +1225,8 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
SgToWiLoadGather, SgToWiStoreScatter, SgToWiVectorReduction,
SgToWiMultiDimReduction, SgToWiLoadMatrix, SgToWiStoreMatrix,
- SgToWiConvertLayout, SgToWiVectorTranspose, SgToWiVectorBitcast>(
- typeConverter, patterns.getContext());
+ SgToWiConvertLayout, SgToWiVectorTranspose, SgToWiVectorBitcast,
+ SgToWiCreateMask<vector::CreateMaskOp>,
+ SgToWiCreateMask<vector::ConstantMaskOp>>(typeConverter,
+ patterns.getContext());
}
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 487d2a6dd7dfe..b6f579a6869f2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -497,6 +497,54 @@ gpu.func @vector_bitcast() {
gpu.return
}
+// CHECK-LABEL: gpu.func @create_mask_1d
+// CHECK-SAME: (%[[M0:.*]]: index)
+// CHECK: %[[LANE:.*]] = gpu.lane_id
+// CHECK: %[[NEW_BOUND:.*]] = affine.apply
+// CHECK: %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+// CHECK: gpu.return
+gpu.func @create_mask_1d(%m0: index) {
+ %mask = vector.create_mask %m0
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xi1>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_1d
+// CHECK: %[[LANE:.*]] = gpu.lane_id
+// CHECK: %[[NEW_BOUND:.*]] = affine.apply
+// CHECK: %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+// CHECK: gpu.return
+gpu.func @constant_mask_1d() {
+ %mask = vector.constant_mask [4]
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xi1>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_mask_2d
+// CHECK-SAME: (%[[M0:.*]]: index, %[[M1:.*]]: index)
+// CHECK: %[[LANE:.*]] = gpu.lane_id
+// CHECK: vector.create_mask {{.*}} : vector<1x2xi1>
+// CHECK: gpu.return
+gpu.func @create_mask_2d(%m0: index, %m1: index) {
+ %mask = vector.create_mask %m0, %m1
+ {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
+ : vector<8x4xi1>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_2d
+// CHECK: %[[LANE:.*]] = gpu.lane_id
+// CHECK: vector.create_mask {{.*}} : vector<1x2xi1>
+// CHECK: gpu.return
+gpu.func @constant_mask_2d() {
+ %mask = vector.constant_mask [2, 3]
+ {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
+ : vector<8x4xi1>
+ gpu.return
+}
+
// CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible
// CHECK-NOT: xegpu.convert_layout
gpu.func @convert_layout_removed_when_compatible() {
>From 5bbf92bdf37b4b253fa4f0146d93292fa2647f08 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 18 Mar 2026 16:16:06 +0000
Subject: [PATCH 3/4] Clean up
---
.../XeGPUSgToWiDistributeExperimental.cpp | 21 +++----------------
1 file changed, 3 insertions(+), 18 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index fe5a143b11fa6..47c29d9d8ead6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -1200,24 +1200,9 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
[=](vector::MultiDimReductionOp op) -> bool {
return !isValidSubgroupMultiReductionOp(op);
});
- // vector::CreateMaskOp is legal only if it has no result layout attribute.
- target.addDynamicallyLegalOp<vector::CreateMaskOp>(
- [=](vector::CreateMaskOp op) -> bool {
- return !xegpu::getTemporaryLayout(op->getOpResult(0));
- });
- // vector::ConstantMaskOp is legal only if it has no result layout attribute.
- target.addDynamicallyLegalOp<vector::ConstantMaskOp>(
- [=](vector::ConstantMaskOp op) -> bool {
- return !xegpu::getTemporaryLayout(op->getOpResult(0));
- });
- // vector::TransposeOp is legal only if it has no result layout attribute.
- target.addDynamicallyLegalOp<vector::TransposeOp>(
- [=](vector::TransposeOp op) -> bool {
- return !xegpu::getTemporaryLayout(op->getOpResult(0));
- });
- // vector::BitCastOp is legal only if it has no result layout attribute.
- target.addDynamicallyLegalOp<vector::BitCastOp>(
- [=](vector::BitCastOp op) -> bool {
+ target.addDynamicallyLegalOp<vector::CreateMaskOp, vector::ConstantMaskOp,
+ vector::TransposeOp, vector::BitCastOp>(
+ [=](Operation *op) -> bool {
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
>From ac0d42c7731953a54a678e4c703e820011a18671 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 23 Mar 2026 22:45:28 +0000
Subject: [PATCH 4/4] Remove operand layouts
---
mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir | 2 --
1 file changed, 2 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a290c3c869c72..c876a844e8ae2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -472,7 +472,6 @@ gpu.func @vector_transpose() {
: () -> (vector<16x2xf32>)
%transpose = vector.transpose %cst, [1, 0]
{
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>,
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
: vector<16x2xf32> to vector<2x16xf32>
@@ -490,7 +489,6 @@ gpu.func @vector_bitcast() {
: () -> (vector<4x32xi8>)
%bitcast = vector.bitcast %cst
{
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
: vector<4x32xi8> to vector<4x16xi16>
More information about the Mlir-commits
mailing list