[Mlir-commits] [mlir] [MLIR][XeGPU] Add distribution patterns for vector insert & extract ops in sg to wi pass (PR #184665)
Nishant Patel
llvmlistbot at llvm.org
Tue Mar 10 11:59:22 PDT 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/184665
>From 1edd303ac210212886f4f2802344d490d0ae9ee3 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 24 Feb 2026 21:39:21 +0000
Subject: [PATCH 1/4] Add distribution pattern for vector.insert &
vector.extract
---
.../XeGPUSgToWiDistributeExperimental.cpp | 70 ++++++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 54 ++++++++++++++
2 files changed, 122 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..211af481f1fbe 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -522,6 +522,57 @@ struct LowerVectorMultiReductionPattern
}
};
+/// Distributes a subgroup-level vector.extract op to workitem-level. Only
+/// sub-vector extraction is handled; a scalar result is a match failure.
+struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
+ using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::ExtractOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // A non-vector result means scalar extraction, which is not handled here.
+ auto resultType = dyn_cast<VectorType>(op.getType());
+ if (!resultType)
+ return rewriter.notifyMatchFailure(op, "scalar extract not supported");
+
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!layout || !layout.isForSubgroup())
+ return failure(); // Result must carry a layout with isForSubgroup().
+
+ auto newOp = vector::ExtractOp::create(
+ rewriter, op.getLoc(), adaptor.getSource(), op.getMixedPosition());
+ rewriter.replaceOp(op, newOp.getResult()); // Position is reused as-is.
+ return success();
+ }
+};
+
+/// Distributes a subgroup-level vector.insert op to workitem-level. Only
+/// handles sub-vector insertion (value to store is VectorType, not scalar).
+struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
+ using OpConversionPattern<vector::InsertOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::InsertOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // A non-vector value-to-store means scalar insertion; not handled here.
+ auto valueType = dyn_cast<VectorType>(op.getValueToStoreType());
+ if (!valueType)
+ return rewriter.notifyMatchFailure(op, "scalar insert not supported");
+
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!layout || !layout.isForSubgroup())
+ return failure(); // Result must carry a layout with isForSubgroup().
+
+ auto newOp = vector::InsertOp::create(
+ rewriter, op.getLoc(), adaptor.getValueToStore(), adaptor.getDest(),
+ op.getMixedPosition());
+ rewriter.replaceOp(op, newOp.getResult()); // Position is reused as-is.
+ return success();
+ }
+};
+
struct XeGPUSgToWiDistributeExperimentalPass
: public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
XeGPUSgToWiDistributeExperimentalPass> {
@@ -727,11 +778,26 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
// Lane local reductions are illegal at this point and must be lowered.
return !isReductionLaneLocal(op);
});
+ // vector::ExtractOp is legal only if its result has no temporary layout
+ // attribute. Scalar extraction is always legal.
+ target.addDynamicallyLegalOp<vector::ExtractOp>(
+ [=](vector::ExtractOp op) -> bool {
+ if (!isa<VectorType>(op.getType()))
+ return true;
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
+ // vector::InsertOp is legal only if its result has no temporary layout
+ // attribute.
+ target.addDynamicallyLegalOp<vector::InsertOp>(
+ [=](vector::InsertOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiVectorReduction, SgToWiMultiDimReduction>(
- typeConverter, patterns.getContext());
+ SgToWiVectorReduction, SgToWiMultiDimReduction,
+ SgToWiVectorExtract, SgToWiVectorInsert>(typeConverter,
+ patterns.getContext());
}
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..1afa786bdfe52 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -317,4 +317,58 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
[1] : vector<16x12xf32> to vector<16xf32>
gpu.return
}
+
+// CHECK-LABEL: gpu.func @vector_extract_from_2d
+// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[0] : vector<1xf32> from vector<4x1xf32>
+gpu.func @vector_extract_from_2d() {
+ %src = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<4x16xf32>
+ %0 = vector.extract %src[0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xf32> from vector<4x16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_extract_from_2d_idx2
+// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
+gpu.func @vector_extract_from_2d_idx2() {
+ %src = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<8x16xf32>
+ %0 = vector.extract %src[2]
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xf32> from vector<8x16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_into_2d
+// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[0] : vector<1xf32> into vector<4x1xf32>
+gpu.func @vector_insert_into_2d() {
+ %val = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<16xf32>
+ %dst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<4x16xf32>
+ %0 = vector.insert %val, %dst[0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16xf32> into vector<4x16xf32>
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_into_2d_idx2
+// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
+gpu.func @vector_insert_into_2d_idx2() {
+ %val = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<16xf32>
+ %dst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<8x16xf32>
+ %0 = vector.insert %val, %dst[2]
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16xf32> into vector<8x16xf32>
+ gpu.return
+}
}
>From 811e5e2170df32f42cbdc6697dc8a6aa9af61924 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 26 Feb 2026 01:40:03 +0000
Subject: [PATCH 2/4] Add distribution pattern for vector.insert_strided_slice
& vector.extract_strided_slice
---
.../XeGPUSgToWiDistributeExperimental.cpp | 200 ++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 241 ++++++++++++++++++
2 files changed, 439 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 211af481f1fbe..b8310116a5507 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -547,6 +547,189 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
}
};
+/// Distributes a subgroup-level vector.extract_strided_slice op to
+/// workitem-level. If the result is distributed, the offsets and sizes are
+/// adjusted to match the distributed types. Supports both distributed and
+/// non-distributed cases.
+struct SgToWiVectorExtractStridedSlice
+ : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+ using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::ExtractStridedSliceOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr resultLayout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!resultLayout || !resultLayout.isForSubgroup())
+ return failure();
+
+ VectorType resultType = op.getType();
+ auto distResultTyOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
+ // If distribution fails (e.g., dimension smaller than lane layout),
+ // the type stays unchanged (same behavior as TypeConverter).
+ VectorType distResultTy =
+ succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
+
+ // A dimension counts as distributed when its size differs between the
+ // original and the distributed result type.
+ SmallVector<int64_t> distributedDims;
+ for (int64_t i = 0; i < resultType.getRank(); ++i) {
+ if (distResultTy.getDimSize(i) != resultType.getDimSize(i))
+ distributedDims.push_back(i);
+ }
+
+ // Copy sizes/offsets/strides, then pad to source rank (full size, 0, 1).
+ int64_t sourceRank = op.getSourceVectorType().getRank();
+ SmallVector<Attribute> updatedSizes =
+ llvm::map_to_vector(op.getSizes(), [](Attribute attr) { return attr; });
+ SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
+ op.getOffsets(), [](Attribute attr) { return attr; });
+ SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
+ op.getStrides(), [](Attribute attr) { return attr; });
+ for (int64_t i = op.getSizes().size(); i < sourceRank; ++i) {
+ updatedSizes.push_back(
+ rewriter.getI64IntegerAttr(op.getSourceVectorType().getDimSize(i)));
+ updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
+ updatedStrides.push_back(rewriter.getI64IntegerAttr(1));
+ }
+
+ // When a dimension is distributed, its size becomes the distributed result
+ // size and its offset is divided by the source lane count along that dim.
+ if (!distributedDims.empty()) {
+ if (distributedDims.size() != 1)
+ return rewriter.notifyMatchFailure(
+ op, "only single dimension distribution is supported");
+ int64_t distDim = distributedDims[0];
+ auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
+ if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+ return rewriter.notifyMatchFailure(
+ op, "source of extract_strided_slice lacks distribution layout");
+ auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
+ int subgroupSize = sourceLaneLayout[distDim];
+ int sourceDistrDimSize = op.getSourceVectorType().getShape()[distDim];
+ if (sourceDistrDimSize % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ op, "source size along distributed dim is not a multiple of "
+ "subgroup size");
+ auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+ if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+ return rewriter.notifyMatchFailure(
+ op, "expecting unit lane data in source layout");
+ int64_t distrDimOffset =
+ cast<IntegerAttr>(updatedOffsets[distDim]).getInt();
+ if (distrDimOffset % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ op, "offset along distributed dim is not a multiple of "
+ "subgroup size");
+ // Rewrite the size and offset entries of the distributed dimension.
+ updatedSizes[distDim] =
+ rewriter.getI64IntegerAttr(distResultTy.getDimSize(distDim));
+ updatedOffsets[distDim] =
+ rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
+ }
+
+ auto newOp = vector::ExtractStridedSliceOp::create(
+ rewriter, op.getLoc(), distResultTy, adaptor.getSource(),
+ ArrayAttr::get(rewriter.getContext(), updatedOffsets),
+ ArrayAttr::get(rewriter.getContext(), updatedSizes),
+ ArrayAttr::get(rewriter.getContext(), updatedStrides));
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
+/// Distributes a subgroup-level vector.insert_strided_slice op to
+/// workitem-level. If the dest is distributed, the offsets are adjusted to
+/// match the distributed types. Supports both distributed and non-distributed
+/// cases.
+struct SgToWiVectorInsertStridedSlice
+ : public OpConversionPattern<vector::InsertStridedSliceOp> {
+ using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::InsertStridedSliceOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ xegpu::DistributeLayoutAttr resultLayout =
+ xegpu::getTemporaryLayout(op->getOpResult(0));
+ if (!resultLayout || !resultLayout.isForSubgroup())
+ return failure();
+
+ VectorType destType = op.getDestVectorType();
+ auto distDestTyOrFailure =
+ xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
+ // If distribution fails (e.g., dimension smaller than lane layout),
+ // the type stays unchanged (same behavior as TypeConverter).
+ VectorType distDestTy =
+ succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
+
+ // Dest dims whose size changes under distribution are the distributed ones.
+ SmallVector<int64_t> destDistributedDims;
+ for (int64_t i = 0; i < destType.getRank(); ++i) {
+ if (distDestTy.getDimSize(i) != destType.getDimSize(i))
+ destDistributedDims.push_back(i);
+ }
+
+ SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
+ op.getOffsets(), [](Attribute attr) { return attr; });
+
+ if (!destDistributedDims.empty()) {
+ if (destDistributedDims.size() != 1)
+ return rewriter.notifyMatchFailure(
+ op, "only single dimension distribution is supported");
+ int64_t destDistDim = destDistributedDims[0];
+
+ VectorType srcType = op.getSourceVectorType();
+ // Source dims line up with dest's trailing dims; map the dest dim back.
+ int64_t sourceDistDim =
+ destDistDim - (destType.getRank() - srcType.getRank());
+ if (sourceDistDim < 0)
+ return rewriter.notifyMatchFailure(
+ op, "distributed dimension must be in the last k dims of dest");
+
+ auto destLayout = xegpu::getTemporaryLayout(op->getOpOperand(1));
+ auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
+ if (!destLayout || !sourceLayout ||
+ destLayout.getEffectiveLaneLayoutAsInt().empty() ||
+ sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+ return rewriter.notifyMatchFailure(
+ op, "source or dest of insert_strided_slice lacks distribution "
+ "layout");
+
+ int subgroupSize = destLayout.getEffectiveLaneLayoutAsInt()[destDistDim];
+ auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
+ auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+ if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
+ !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+ return rewriter.notifyMatchFailure(
+ op, "expecting unit lane data in source and dest layouts");
+
+ int64_t srcDistrDimSize = srcType.getDimSize(sourceDistDim);
+ if (srcDistrDimSize % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ op, "source distributed dim size is not a multiple of "
+ "subgroup size");
+
+ int64_t destDistrDimOffset =
+ cast<IntegerAttr>(op.getOffsets()[destDistDim]).getInt();
+ if (destDistrDimOffset % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ op, "offset along distributed dim is not a multiple of "
+ "subgroup size");
+ // Only this offset entry is rewritten; strides are forwarded unchanged.
+ updatedOffsets[destDistDim] =
+ rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
+ }
+
+ auto newOp = vector::InsertStridedSliceOp::create(
+ rewriter, op.getLoc(), distDestTy, adaptor.getValueToStore(),
+ adaptor.getDest(),
+ ArrayAttr::get(rewriter.getContext(), updatedOffsets), op.getStrides());
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
/// Distributes a subgroup-level vector.insert op to workitem-level. Only
/// handles sub-vector insertion (value to store is VectorType, not scalar).
struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
@@ -792,12 +975,25 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
[=](vector::InsertOp op) -> bool {
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
+ // vector::ExtractStridedSliceOp is legal only if its result has no temporary
+ // layout attribute.
+ target.addDynamicallyLegalOp<vector::ExtractStridedSliceOp>(
+ [=](vector::ExtractStridedSliceOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
+ // vector::InsertStridedSliceOp is legal only if its result has no temporary
+ // layout attribute.
+ target.addDynamicallyLegalOp<vector::InsertStridedSliceOp>(
+ [=](vector::InsertStridedSliceOp op) -> bool {
+ return !xegpu::getTemporaryLayout(op->getOpResult(0));
+ });
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
SgToWiVectorReduction, SgToWiMultiDimReduction,
- SgToWiVectorExtract, SgToWiVectorInsert>(typeConverter,
- patterns.getContext());
+ SgToWiVectorExtract, SgToWiVectorInsert,
+ SgToWiVectorExtractStridedSlice, SgToWiVectorInsertStridedSlice>(
+ typeConverter, patterns.getContext());
}
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1afa786bdfe52..81e4697959438 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -371,4 +371,245 @@ gpu.func @vector_insert_into_2d_idx2() {
: vector<16xf32> into vector<8x16xf32>
gpu.return
}
+
+// extract_strided_slice: distributed dim fully extracted
+// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8,0] sizes [8,16] → [8,0] sizes [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<24x16xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<24x16xf32> to vector<8x16xf32>
+ gpu.return
+}
+
+// extract_strided_slice: non-distributed (source already has unit dim)
+// Source: vector<24x1xf32> layout [1,16] → stays vector<24x1xf32>
+// Result: vector<8x1xf32> layout [1,16] → stays vector<8x1xf32>
+// Offsets and sizes unchanged
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_non_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<24x1xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<24x1xf32> to vector<8x1xf32>
+ gpu.return
+}
+
+// extract_strided_slice: inner distributed (dim 1 distributed)
+// Source: vector<24x64xf32> layout [1,16] → distributed to vector<24x4xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8,48] → [8, 48/16=3], sizes [8,16] → [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_inner_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<24x64xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<24x64xf32> to vector<8x16xf32>
+ gpu.return
+}
+
+// extract_strided_slice: outer distributed (dim 0 distributed)
+// Source: vector<32x16xf32> layout [16,1] → distributed to vector<2x16xf32>
+// Result: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
+// Offsets [16] padded to [16,0] → [16/16=1, 0], sizes [16] padded to [16,16] → [1,16]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
+gpu.func @vector_extract_strided_slice_outer_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> vector<32x16xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ }
+ : vector<32x16xf32> to vector<16x16xf32>
+ gpu.return
+}
+
+// extract_strided_slice: 1D distributed
+// Source: vector<64xf32> layout [16] → distributed to vector<4xf32>
+// Result: vector<32xf32> layout [16] → distributed to vector<2xf32>
+// Offsets [16] → [16/16=1], sizes [32] → [2]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
+gpu.func @vector_extract_strided_slice_1d() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<64xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<64xf32> to vector<32xf32>
+ gpu.return
+}
+
+// extract_strided_slice: partial offsets (offsets rank < source rank)
+// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8] padded to [8,0], sizes [8] padded to [8,16] → [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_partial_offsets() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<24x16xf32>
+ %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<24x16xf32> to vector<8x16xf32>
+ gpu.return
+}
+
+// insert_strided_slice: distributed dim fully inserted (dim 1 distributed)
+// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
+// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
+// Offsets [24,0] → [24,0] (offset 0 / 16 = 0)
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<16x16xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<64x16xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x16xf32> into vector<64x16xf32>
+ gpu.return
+}
+
+// insert_strided_slice: non-distributed (types already have unit dim)
+// Source: vector<16x1xf32> layout [1,16] → stays vector<16x1xf32>
+// Dest: vector<64x1xf32> layout [1,16] → stays vector<64x1xf32>
+// Offsets unchanged
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_non_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<16x1xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<64x1xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x1xf32> into vector<64x1xf32>
+ gpu.return
+}
+
+// insert_strided_slice: inner distributed (dim 1 distributed)
+// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
+// Dest: vector<64x32xf32> layout [1,16] → distributed to vector<64x2xf32>
+// Offsets [24,16] → [24, 16/16=1]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
+gpu.func @vector_insert_strided_slice_inner_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<16x16xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<64x32xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x16xf32> into vector<64x32xf32>
+ gpu.return
+}
+
+// insert_strided_slice: outer distributed (dim 0 distributed)
+// Source: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
+// Dest: vector<48x32xf32> layout [16,1] → distributed to vector<3x32xf32>
+// Offsets [32,4] → [32/16=2, 4]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
+gpu.func @vector_insert_strided_slice_outer_distributed() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> vector<16x16xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> vector<48x32xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ }
+ : vector<16x16xf32> into vector<48x32xf32>
+ gpu.return
+}
+
+// insert_strided_slice: 1D distributed
+// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
+// Dest: vector<48xf32> layout [16] → distributed to vector<3xf32>
+// Offsets [16] → [16/16=1]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
+gpu.func @vector_insert_strided_slice_1d() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<16xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<48xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16xf32> into vector<48xf32>
+ gpu.return
+}
+
+// insert_strided_slice: different ranks (1D source into 2D dest)
+// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
+// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
+// Distributed dim 1, sourceDistDim = 1 - (2-1) = 0
+// Offsets [13,0] → [13, 0/16=0]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_different_ranks() {
+ %0 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : () -> vector<16xf32>
+ %1 = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> vector<64x16xf32>
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16xf32> into vector<64x16xf32>
+ gpu.return
+}
}
>From ec7032a2dcc246ae88e326a13490a07d5cb79454 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 4 Mar 2026 19:02:07 +0000
Subject: [PATCH 3/4] Clean up
---
.../XeGPUSgToWiDistributeExperimental.cpp | 18 ++-----
.../XeGPU/sg-to-wi-experimental-unit.mlir | 49 -------------------
2 files changed, 4 insertions(+), 63 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 29c1e08427fd2..29f9e96df6c76 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -723,8 +723,7 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
/// Distributes a subgroup-level vector.extract_strided_slice op to
/// workitem-level. If the result is distributed, the offsets and sizes are
-/// adjusted to match the distributed types. Supports both distributed and
-/// non-distributed cases.
+/// adjusted to match the distributed types.
struct SgToWiVectorExtractStridedSlice
: public OpConversionPattern<vector::ExtractStridedSliceOp> {
using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
@@ -741,7 +740,7 @@ struct SgToWiVectorExtractStridedSlice
auto distResultTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
// If distribution fails (e.g., dimension smaller than lane layout),
- // the type stays unchanged (same behavior as TypeConverter).
+ // the type stays unchanged.
VectorType distResultTy =
succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
@@ -815,8 +814,7 @@ struct SgToWiVectorExtractStridedSlice
/// Distributes a subgroup-level vector.insert_strided_slice op to
/// workitem-level. If the dest is distributed, the offsets are adjusted to
-/// match the distributed types. Supports both distributed and non-distributed
-/// cases.
+/// match the distributed types.
struct SgToWiVectorInsertStridedSlice
: public OpConversionPattern<vector::InsertStridedSliceOp> {
using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
@@ -833,7 +831,7 @@ struct SgToWiVectorInsertStridedSlice
auto distDestTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
// If distribution fails (e.g., dimension smaller than lane layout),
- // the type stays unchanged (same behavior as TypeConverter).
+ // the type stays unchanged.
VectorType distDestTy =
succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
@@ -1131,28 +1129,20 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
[=](vector::MultiDimReductionOp op) -> bool {
return !isValidSubgroupMultiReductionOp(op);
});
- // vector::ExtractOp is legal only if its result has no temporary layout
- // attribute. Scalar extraction is always legal.
target.addDynamicallyLegalOp<vector::ExtractOp>(
[=](vector::ExtractOp op) -> bool {
if (!isa<VectorType>(op.getType()))
return true;
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
- // vector::InsertOp is legal only if its result has no temporary layout
- // attribute.
target.addDynamicallyLegalOp<vector::InsertOp>(
[=](vector::InsertOp op) -> bool {
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
- // vector::ExtractStridedSliceOp is legal only if its result has no temporary
- // layout attribute.
target.addDynamicallyLegalOp<vector::ExtractStridedSliceOp>(
[=](vector::ExtractStridedSliceOp op) -> bool {
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
- // vector::InsertStridedSliceOp is legal only if its result has no temporary
- // layout attribute.
target.addDynamicallyLegalOp<vector::InsertStridedSliceOp>(
[=](vector::InsertStridedSliceOp op) -> bool {
return !xegpu::getTemporaryLayout(op->getOpResult(0));
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index af6827680145c..520edd4b98ee1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -515,10 +515,6 @@ gpu.func @vector_insert_into_2d_idx2() {
gpu.return
}
-// extract_strided_slice: distributed dim fully extracted
-// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8,0] sizes [8,16] → [8,0] sizes [8,1]
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
@@ -533,10 +529,6 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
gpu.return
}
-// extract_strided_slice: non-distributed (source already has unit dim)
-// Source: vector<24x1xf32> layout [1,16] → stays vector<24x1xf32>
-// Result: vector<8x1xf32> layout [1,16] → stays vector<8x1xf32>
-// Offsets and sizes unchanged
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_non_distributed() {
@@ -551,10 +543,6 @@ gpu.func @vector_extract_strided_slice_non_distributed() {
gpu.return
}
-// extract_strided_slice: inner distributed (dim 1 distributed)
-// Source: vector<24x64xf32> layout [1,16] → distributed to vector<24x4xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8,48] → [8, 48/16=3], sizes [8,16] → [8,1]
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_inner_distributed() {
@@ -569,10 +557,6 @@ gpu.func @vector_extract_strided_slice_inner_distributed() {
gpu.return
}
-// extract_strided_slice: outer distributed (dim 0 distributed)
-// Source: vector<32x16xf32> layout [16,1] → distributed to vector<2x16xf32>
-// Result: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
-// Offsets [16] padded to [16,0] → [16/16=1, 0], sizes [16] padded to [16,16] → [1,16]
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
gpu.func @vector_extract_strided_slice_outer_distributed() {
@@ -587,10 +571,6 @@ gpu.func @vector_extract_strided_slice_outer_distributed() {
gpu.return
}
-// extract_strided_slice: 1D distributed
-// Source: vector<64xf32> layout [16] → distributed to vector<4xf32>
-// Result: vector<32xf32> layout [16] → distributed to vector<2xf32>
-// Offsets [16] → [16/16=1], sizes [32] → [2]
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
gpu.func @vector_extract_strided_slice_1d() {
@@ -605,10 +585,6 @@ gpu.func @vector_extract_strided_slice_1d() {
gpu.return
}
-// extract_strided_slice: partial offsets (offsets rank < source rank)
-// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8] padded to [8,0], sizes [8] padded to [8,16] → [8,1]
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_partial_offsets() {
@@ -623,10 +599,6 @@ gpu.func @vector_extract_strided_slice_partial_offsets() {
gpu.return
}
-// insert_strided_slice: distributed dim fully inserted (dim 1 distributed)
-// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
-// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
-// Offsets [24,0] → [24,0] (offset 0 / 16 = 0)
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
@@ -645,10 +617,6 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
gpu.return
}
-// insert_strided_slice: non-distributed (types already have unit dim)
-// Source: vector<16x1xf32> layout [1,16] → stays vector<16x1xf32>
-// Dest: vector<64x1xf32> layout [1,16] → stays vector<64x1xf32>
-// Offsets unchanged
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
gpu.func @vector_insert_strided_slice_non_distributed() {
@@ -667,10 +635,6 @@ gpu.func @vector_insert_strided_slice_non_distributed() {
gpu.return
}
-// insert_strided_slice: inner distributed (dim 1 distributed)
-// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
-// Dest: vector<64x32xf32> layout [1,16] → distributed to vector<64x2xf32>
-// Offsets [24,16] → [24, 16/16=1]
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
gpu.func @vector_insert_strided_slice_inner_distributed() {
@@ -689,10 +653,6 @@ gpu.func @vector_insert_strided_slice_inner_distributed() {
gpu.return
}
-// insert_strided_slice: outer distributed (dim 0 distributed)
-// Source: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
-// Dest: vector<48x32xf32> layout [16,1] → distributed to vector<3x32xf32>
-// Offsets [32,4] → [32/16=2, 4]
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
gpu.func @vector_insert_strided_slice_outer_distributed() {
@@ -711,10 +671,6 @@ gpu.func @vector_insert_strided_slice_outer_distributed() {
gpu.return
}
-// insert_strided_slice: 1D distributed
-// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
-// Dest: vector<48xf32> layout [16] → distributed to vector<3xf32>
-// Offsets [16] → [16/16=1]
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
gpu.func @vector_insert_strided_slice_1d() {
@@ -733,11 +689,6 @@ gpu.func @vector_insert_strided_slice_1d() {
gpu.return
}
-// insert_strided_slice: different ranks (1D source into 2D dest)
-// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
-// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
-// Distributed dim 1, sourceDistDim = 1 - (2-1) = 0
-// Offsets [13,0] → [13, 0/16=0]
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
gpu.func @vector_insert_strided_slice_different_ranks() {
>From 9f2552d4d02f62c2f4fc0e130af068c3b592bba9 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 10 Mar 2026 18:36:00 +0000
Subject: [PATCH 4/4] Address feedback
---
.../XeGPUSgToWiDistributeExperimental.cpp | 65 +++++++++++--------
.../XeGPU/sg-to-wi-experimental-unit.mlir | 40 ++----------
2 files changed, 43 insertions(+), 62 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 29f9e96df6c76..c1173e22b96a0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -121,6 +121,20 @@ static bool isReductionLaneLocal(vector::MultiDimReductionOp op) {
return resTy != resDistTypeOrFailure.value();
}
+/// Returns, in ascending order, the indices of the dims whose size differs
+/// between `originalType` and `distributedType` (ranks must match; asserted).
+static SmallVector<int64_t> getDistributedDims(VectorType originalType,
+ VectorType distributedType) {
+ assert(originalType.getRank() == distributedType.getRank() &&
+ "original and distributed vector types must have the same rank");
+ SmallVector<int64_t> distributedDims;
+ for (int64_t i = 0; i < originalType.getRank(); ++i) {
+ if (distributedType.getDimSize(i) != originalType.getDimSize(i))
+ distributedDims.push_back(i);
+ }
+ return distributedDims;
+}
+
/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
/// op. This simply drops the layout attribute from the tensor descriptor type.
struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
@@ -739,18 +753,13 @@ struct SgToWiVectorExtractStridedSlice
VectorType resultType = op.getType();
auto distResultTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
- // If distribution fails (e.g., dimension smaller than lane layout),
- // the type stays unchanged.
- VectorType distResultTy =
- succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
-
- // Find distributed dimensions by comparing original and distributed
- // result types.
- SmallVector<int64_t> distributedDims;
- for (int64_t i = 0; i < resultType.getRank(); ++i) {
- if (distResultTy.getDimSize(i) != resultType.getDimSize(i))
- distributedDims.push_back(i);
- }
+ if (failed(distResultTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute distributed vector type from lane layout");
+ VectorType distResultTy = *distResultTyOrFailure;
+
+ SmallVector<int64_t> distributedDims =
+ getDistributedDims(resultType, distResultTy);
// Collect updated sizes, offsets, strides. Pad to full source rank.
int64_t sourceRank = op.getSourceVectorType().getRank();
@@ -774,12 +783,15 @@ struct SgToWiVectorExtractStridedSlice
return rewriter.notifyMatchFailure(
op, "only single dimension distribution is supported");
int64_t distDim = distributedDims[0];
+ const uArch *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+ op, "target attribute required to determine subgroup size");
+ int subgroupSize = uArch->getSubgroupSize();
auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
return rewriter.notifyMatchFailure(
op, "source of extract_strided_slice lacks distribution layout");
- auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
- int subgroupSize = sourceLaneLayout[distDim];
int sourceDistrDimSize = op.getSourceVectorType().getShape()[distDim];
if (sourceDistrDimSize % subgroupSize != 0)
return rewriter.notifyMatchFailure(
@@ -830,17 +842,13 @@ struct SgToWiVectorInsertStridedSlice
VectorType destType = op.getDestVectorType();
auto distDestTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
- // If distribution fails (e.g., dimension smaller than lane layout),
- // the type stays unchanged.
- VectorType distDestTy =
- succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
-
- // Find distributed dimensions of the dest vector.
- SmallVector<int64_t> destDistributedDims;
- for (int64_t i = 0; i < destType.getRank(); ++i) {
- if (distDestTy.getDimSize(i) != destType.getDimSize(i))
- destDistributedDims.push_back(i);
- }
+ if (failed(distDestTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "unable to compute distributed vector type from lane layout");
+ VectorType distDestTy = *distDestTyOrFailure;
+
+ SmallVector<int64_t> destDistributedDims =
+ getDistributedDims(destType, distDestTy);
SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
op.getOffsets(), [](Attribute attr) { return attr; });
@@ -851,6 +859,12 @@ struct SgToWiVectorInsertStridedSlice
op, "only single dimension distribution is supported");
int64_t destDistDim = destDistributedDims[0];
+ const uArch *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+ if (!uArch)
+ return rewriter.notifyMatchFailure(
+ op, "target attribute required to determine subgroup size");
+ int subgroupSize = uArch->getSubgroupSize();
+
VectorType srcType = op.getSourceVectorType();
// The distributed dim must be in the last k (source rank) dims of dest.
int64_t sourceDistDim =
@@ -868,7 +882,6 @@ struct SgToWiVectorInsertStridedSlice
op, "source or dest of insert_strided_slice lacks distribution "
"layout");
- int subgroupSize = destLayout.getEffectiveLaneLayoutAsInt()[destDistDim];
auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 520edd4b98ee1..c2c97c5f32a7c 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,9 +473,9 @@ gpu.func @vector_extract_from_2d() {
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_extract_from_2d_idx2
+// CHECK-LABEL: gpu.func @vector_extract_from_2d_offset2
// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
-gpu.func @vector_extract_from_2d_idx2() {
+gpu.func @vector_extract_from_2d_offset2() {
%src = "some_op"()
{layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: () -> vector<8x16xf32>
@@ -500,9 +500,9 @@ gpu.func @vector_insert_into_2d() {
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_insert_into_2d_idx2
+// CHECK-LABEL: gpu.func @vector_insert_into_2d_offset2
// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
-gpu.func @vector_insert_into_2d_idx2() {
+gpu.func @vector_insert_into_2d_offset2() {
%val = "some_op"()
{layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
: () -> vector<16xf32>
@@ -529,20 +529,6 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
-// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-gpu.func @vector_extract_strided_slice_non_distributed() {
- %0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> vector<24x1xf32>
- %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<24x1xf32> to vector<8x1xf32>
- gpu.return
-}
-
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
gpu.func @vector_extract_strided_slice_inner_distributed() {
@@ -617,24 +603,6 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
-// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-gpu.func @vector_insert_strided_slice_non_distributed() {
- %0 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> vector<16x1xf32>
- %1 = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> vector<64x1xf32>
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x1xf32> into vector<64x1xf32>
- gpu.return
-}
-
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
gpu.func @vector_insert_strided_slice_inner_distributed() {
More information about the Mlir-commits
mailing list