[Mlir-commits] [mlir] [MLIR][XeGPU] Add distribution patterns for vector insert & extract ops in sg to wi pass (PR #184665)

Tue Mar 10 11:59:22 PDT 2026

https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/184665

>From 1edd303ac210212886f4f2802344d490d0ae9ee3 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 24 Feb 2026 21:39:21 +0000
Subject: [PATCH 1/4] Add distribution pattern for vector.insert &
 vector.extract

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 70 ++++++++++++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 54 ++++++++++++++
 2 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..211af481f1fbe 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -522,6 +522,57 @@ struct LowerVectorMultiReductionPattern
   }
 };
 
+/// Distributes a subgroup-level vector.extract op to workitem-level by
+/// re-creating it on the adaptor's source operand at the same position.
+struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
+  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only sub-vector results are handled; scalar extraction is rejected.
+    auto resultType = dyn_cast<VectorType>(op.getType());
+    if (!resultType)
+      return rewriter.notifyMatchFailure(op, "scalar extract not supported");
+
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!layout || !layout.isForSubgroup())
+      return failure();
+
+    auto newOp = vector::ExtractOp::create(
+        rewriter, op.getLoc(), adaptor.getSource(), op.getMixedPosition());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
+/// Distributes a subgroup-level vector.insert op to workitem-level. Only
+/// handles sub-vector insertion (value to store is VectorType, not scalar).
+struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
+  using OpConversionPattern<vector::InsertOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::InsertOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only sub-vector values are handled; scalar insertion is rejected.
+    auto valueType = dyn_cast<VectorType>(op.getValueToStoreType());
+    if (!valueType)
+      return rewriter.notifyMatchFailure(op, "scalar insert not supported");
+
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!layout || !layout.isForSubgroup())
+      return failure();
+
+    auto newOp = vector::InsertOp::create(
+        rewriter, op.getLoc(), adaptor.getValueToStore(), adaptor.getDest(),
+        op.getMixedPosition());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -727,11 +778,26 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         // Lane local reductions are illegal at this point and must be lowered.
         return !isReductionLaneLocal(op);
       });
+  // vector::ExtractOp is legal only if its result has no temporary layout
+  // attribute. Scalar extraction is always legal.
+  target.addDynamicallyLegalOp<vector::ExtractOp>(
+      [=](vector::ExtractOp op) -> bool {
+        if (!isa<VectorType>(op.getType()))
+          return true;
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
+  // vector::InsertOp is legal only if its result has no temporary layout
+  // attribute.
+  target.addDynamicallyLegalOp<vector::InsertOp>(
+      [=](vector::InsertOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
                SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
-               SgToWiVectorReduction, SgToWiMultiDimReduction>(
-      typeConverter, patterns.getContext());
+               SgToWiVectorReduction, SgToWiMultiDimReduction,
+               SgToWiVectorExtract, SgToWiVectorInsert>(typeConverter,
+                                                        patterns.getContext());
 }
 
 void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..1afa786bdfe52 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -317,4 +317,58 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
       [1] : vector<16x12xf32> to vector<16xf32>
   gpu.return
 }
+
+// CHECK-LABEL: gpu.func @vector_extract_from_2d
+// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[0] : vector<1xf32> from vector<4x1xf32>
+gpu.func @vector_extract_from_2d() {
+  %src = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<4x16xf32>
+  %0 = vector.extract %src[0]
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xf32> from vector<4x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_extract_from_2d_idx2
+// CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
+gpu.func @vector_extract_from_2d_idx2() {
+  %src = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<8x16xf32>
+  %0 = vector.extract %src[2]
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xf32> from vector<8x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_into_2d
+// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[0] : vector<1xf32> into vector<4x1xf32>
+gpu.func @vector_insert_into_2d() {
+  %val = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<16xf32>
+  %dst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<4x16xf32>
+  %0 = vector.insert %val, %dst[0]
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<16xf32> into vector<4x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_into_2d_idx2
+// CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
+gpu.func @vector_insert_into_2d_idx2() {
+  %val = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<16xf32>
+  %dst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<8x16xf32>
+  %0 = vector.insert %val, %dst[2]
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<16xf32> into vector<8x16xf32>
+  gpu.return
+}
 }

>From 811e5e2170df32f42cbdc6697dc8a6aa9af61924 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 26 Feb 2026 01:40:03 +0000
Subject: [PATCH 2/4] Add distribution pattern for vector.insert_strided_slice
 & vector.extract_strided_slice

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 200 ++++++++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 241 ++++++++++++++++++
 2 files changed, 439 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 211af481f1fbe..b8310116a5507 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -547,6 +547,189 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
   }
 };
 
+/// Distributes a subgroup-level vector.extract_strided_slice op to
+/// workitem-level. If the result is distributed, the offsets and sizes are
+/// adjusted to match the distributed types. Supports both distributed and
+/// non-distributed cases.
+struct SgToWiVectorExtractStridedSlice
+    : public OpConversionPattern<vector::ExtractStridedSliceOp> {
+  using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::ExtractStridedSliceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!resultLayout || !resultLayout.isForSubgroup())
+      return failure();
+
+    VectorType resultType = op.getType();
+    auto distResultTyOrFailure =
+        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
+    // If distribution fails (e.g., dimension smaller than lane layout),
+    // the type stays unchanged (same behavior as TypeConverter).
+    VectorType distResultTy =
+        succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
+
+    // A dimension counts as distributed when its size differs between the
+    // original and the distributed result type.
+    SmallVector<int64_t> distributedDims;
+    for (int64_t i = 0; i < resultType.getRank(); ++i) {
+      if (distResultTy.getDimSize(i) != resultType.getDimSize(i))
+        distributedDims.push_back(i);
+    }
+
+    // Copy sizes/offsets/strides and pad to source rank with full-dim slices.
+    int64_t sourceRank = op.getSourceVectorType().getRank();
+    SmallVector<Attribute> updatedSizes =
+        llvm::map_to_vector(op.getSizes(), [](Attribute attr) { return attr; });
+    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
+        op.getOffsets(), [](Attribute attr) { return attr; });
+    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
+        op.getStrides(), [](Attribute attr) { return attr; });
+    for (int64_t i = op.getSizes().size(); i < sourceRank; ++i) {
+      updatedSizes.push_back(
+          rewriter.getI64IntegerAttr(op.getSourceVectorType().getDimSize(i)));
+      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
+      updatedStrides.push_back(rewriter.getI64IntegerAttr(1));
+    }
+
+    // If the result is distributed, rescale the offset and size along the
+    // distributed dimension to per-lane units.
+    if (!distributedDims.empty()) {
+      if (distributedDims.size() != 1)
+        return rewriter.notifyMatchFailure(
+            op, "only single dimension distribution is supported");
+      int64_t distDim = distributedDims[0];
+      auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
+      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+        return rewriter.notifyMatchFailure(
+            op, "source of extract_strided_slice lacks distribution layout");
+      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
+      int subgroupSize = sourceLaneLayout[distDim];
+      int sourceDistrDimSize = op.getSourceVectorType().getShape()[distDim];
+      if (sourceDistrDimSize % subgroupSize != 0)
+        return rewriter.notifyMatchFailure(
+            op, "source size along distributed dim is not a multiple of "
+                "subgroup size");
+      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+        return rewriter.notifyMatchFailure(
+            op, "expecting unit lane data in source layout");
+      int64_t distrDimOffset =
+          cast<IntegerAttr>(updatedOffsets[distDim]).getInt();
+      if (distrDimOffset % subgroupSize != 0)
+        return rewriter.notifyMatchFailure(
+            op, "offset along distributed dim is not a multiple of "
+                "subgroup size");
+      // Size becomes the distributed extent; offset is divided by lane count.
+      updatedSizes[distDim] =
+          rewriter.getI64IntegerAttr(distResultTy.getDimSize(distDim));
+      updatedOffsets[distDim] =
+          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
+    }
+
+    auto newOp = vector::ExtractStridedSliceOp::create(
+        rewriter, op.getLoc(), distResultTy, adaptor.getSource(),
+        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
+        ArrayAttr::get(rewriter.getContext(), updatedSizes),
+        ArrayAttr::get(rewriter.getContext(), updatedStrides));
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
+/// Distributes a subgroup-level vector.insert_strided_slice op to
+/// workitem-level. If the dest is distributed, the offsets are adjusted to
+/// match the distributed types. Supports both distributed and non-distributed
+/// cases.
+struct SgToWiVectorInsertStridedSlice
+    : public OpConversionPattern<vector::InsertStridedSliceOp> {
+  using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::InsertStridedSliceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!resultLayout || !resultLayout.isForSubgroup())
+      return failure();
+
+    VectorType destType = op.getDestVectorType();
+    auto distDestTyOrFailure =
+        xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
+    // If distribution fails (e.g., dimension smaller than lane layout),
+    // the type stays unchanged (same behavior as TypeConverter).
+    VectorType distDestTy =
+        succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
+
+    // Dest dims whose size differs after distribution are the distributed dims.
+    SmallVector<int64_t> destDistributedDims;
+    for (int64_t i = 0; i < destType.getRank(); ++i) {
+      if (distDestTy.getDimSize(i) != destType.getDimSize(i))
+        destDistributedDims.push_back(i);
+    }
+
+    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
+        op.getOffsets(), [](Attribute attr) { return attr; });
+
+    if (!destDistributedDims.empty()) {
+      if (destDistributedDims.size() != 1)
+        return rewriter.notifyMatchFailure(
+            op, "only single dimension distribution is supported");
+      int64_t destDistDim = destDistributedDims[0];
+
+      VectorType srcType = op.getSourceVectorType();
+      // The distributed dim must be in the last k (source rank) dims of dest.
+      int64_t sourceDistDim =
+          destDistDim - (destType.getRank() - srcType.getRank());
+      if (sourceDistDim < 0)
+        return rewriter.notifyMatchFailure(
+            op, "distributed dimension must be in the last k dims of dest");
+
+      auto destLayout = xegpu::getTemporaryLayout(op->getOpOperand(1));
+      auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
+      if (!destLayout || !sourceLayout ||
+          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
+          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+        return rewriter.notifyMatchFailure(
+            op, "source or dest of insert_strided_slice lacks distribution "
+                "layout");
+
+      int subgroupSize = destLayout.getEffectiveLaneLayoutAsInt()[destDistDim];
+      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
+      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
+          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+        return rewriter.notifyMatchFailure(
+            op, "expecting unit lane data in source and dest layouts");
+
+      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistDim);
+      if (srcDistrDimSize % subgroupSize != 0)
+        return rewriter.notifyMatchFailure(
+            op, "source distributed dim size is not a multiple of "
+                "subgroup size");
+
+      int64_t destDistrDimOffset =
+          cast<IntegerAttr>(op.getOffsets()[destDistDim]).getInt();
+      if (destDistrDimOffset % subgroupSize != 0)
+        return rewriter.notifyMatchFailure(
+            op, "offset along distributed dim is not a multiple of "
+                "subgroup size");
+      // Express the distributed-dim offset in per-lane units.
+      updatedOffsets[destDistDim] =
+          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
+    }
+
+    auto newOp = vector::InsertStridedSliceOp::create(
+        rewriter, op.getLoc(), distDestTy, adaptor.getValueToStore(),
+        adaptor.getDest(),
+        ArrayAttr::get(rewriter.getContext(), updatedOffsets), op.getStrides());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 /// Distributes a subgroup-level vector.insert op to workitem-level. Only
 /// handles sub-vector insertion (value to store is VectorType, not scalar).
 struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
@@ -792,12 +975,25 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       [=](vector::InsertOp op) -> bool {
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
+  // vector::ExtractStridedSliceOp is legal only if its result has no temporary
+  // layout attribute.
+  target.addDynamicallyLegalOp<vector::ExtractStridedSliceOp>(
+      [=](vector::ExtractStridedSliceOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
+  // vector::InsertStridedSliceOp is legal only if its result has no temporary
+  // layout attribute.
+  target.addDynamicallyLegalOp<vector::InsertStridedSliceOp>(
+      [=](vector::InsertStridedSliceOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
                SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
                SgToWiVectorReduction, SgToWiMultiDimReduction,
-               SgToWiVectorExtract, SgToWiVectorInsert>(typeConverter,
-                                                        patterns.getContext());
+               SgToWiVectorExtract, SgToWiVectorInsert,
+               SgToWiVectorExtractStridedSlice, SgToWiVectorInsertStridedSlice>(
+      typeConverter, patterns.getContext());
 }
 
 void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1afa786bdfe52..81e4697959438 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -371,4 +371,245 @@ gpu.func @vector_insert_into_2d_idx2() {
     : vector<16xf32> into vector<8x16xf32>
   gpu.return
 }
+
+// extract_strided_slice: distributed dim fully extracted
+// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8,0] → [8, 0/16=0], sizes [8,16] → [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<24x16xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<24x16xf32> to vector<8x16xf32>
+  gpu.return
+}
+
+// extract_strided_slice: non-distributed (source already has unit dim)
+// Source: vector<24x1xf32> layout [1,16] → stays vector<24x1xf32>
+// Result: vector<8x1xf32> layout [1,16] → stays vector<8x1xf32>
+// Offsets and sizes unchanged
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_non_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<24x1xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<24x1xf32> to vector<8x1xf32>
+  gpu.return
+}
+
+// extract_strided_slice: inner distributed (dim 1 distributed)
+// Source: vector<24x64xf32> layout [1,16] → distributed to vector<24x4xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8,48] → [8, 48/16=3], sizes [8,16] → [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_inner_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<24x64xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<24x64xf32> to vector<8x16xf32>
+  gpu.return
+}
+
+// extract_strided_slice: outer distributed (dim 0 distributed)
+// Source: vector<32x16xf32> layout [16,1] → distributed to vector<2x16xf32>
+// Result: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
+// Offsets [16] padded to [16,0] → [16/16=1, 0], sizes [16] padded to [16,16] → [1,16]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
+gpu.func @vector_extract_strided_slice_outer_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : () -> vector<32x16xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+    }
+    : vector<32x16xf32> to vector<16x16xf32>
+  gpu.return
+}
+
+// extract_strided_slice: 1D distributed
+// Source: vector<64xf32> layout [16] → distributed to vector<4xf32>
+// Result: vector<32xf32> layout [16] → distributed to vector<2xf32>
+// Offsets [16] → [16/16=1], sizes [32] → [2]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
+gpu.func @vector_extract_strided_slice_1d() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<64xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+    }
+    : vector<64xf32> to vector<32xf32>
+  gpu.return
+}
+
+// extract_strided_slice: partial offsets (offsets rank < source rank)
+// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
+// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
+// Offsets [8] padded to [8,0], sizes [8] padded to [8,16] → [8,1]
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
+// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+gpu.func @vector_extract_strided_slice_partial_offsets() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<24x16xf32>
+  %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<24x16xf32> to vector<8x16xf32>
+  gpu.return
+}
+
+// insert_strided_slice: distributed dim fully inserted (dim 1 distributed)
+// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
+// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
+// Offsets [24,0] → [24, 0/16=0]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<16x16xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<64x16xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x16xf32> into vector<64x16xf32>
+  gpu.return
+}
+
+// insert_strided_slice: non-distributed (types already have unit dim)
+// Source: vector<16x1xf32> layout [1,16] → stays vector<16x1xf32>
+// Dest: vector<64x1xf32> layout [1,16] → stays vector<64x1xf32>
+// Offsets unchanged
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_non_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<16x1xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<64x1xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x1xf32> into vector<64x1xf32>
+  gpu.return
+}
+
+// insert_strided_slice: inner distributed (dim 1 distributed)
+// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
+// Dest: vector<64x32xf32> layout [1,16] → distributed to vector<64x2xf32>
+// Offsets [24,16] → [24, 16/16=1]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
+gpu.func @vector_insert_strided_slice_inner_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<16x16xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<64x32xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x16xf32> into vector<64x32xf32>
+  gpu.return
+}
+
+// insert_strided_slice: outer distributed (dim 0 distributed)
+// Source: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
+// Dest: vector<48x32xf32> layout [16,1] → distributed to vector<3x32xf32>
+// Offsets [32,4] → [32/16=2, 4]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
+gpu.func @vector_insert_strided_slice_outer_distributed() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : () -> vector<16x16xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : () -> vector<48x32xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+    }
+    : vector<16x16xf32> into vector<48x32xf32>
+  gpu.return
+}
+
+// insert_strided_slice: 1D distributed
+// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
+// Dest: vector<48xf32> layout [16] → distributed to vector<3xf32>
+// Offsets [16] → [16/16=1]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
+gpu.func @vector_insert_strided_slice_1d() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<16xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<48xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+    }
+    : vector<16xf32> into vector<48xf32>
+  gpu.return
+}
+
+// insert_strided_slice: different ranks (1D source into 2D dest)
+// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
+// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
+// Distributed dim 1, sourceDistDim = 1 - (2-1) = 0
+// Offsets [13,0] → [13, 0/16=0]
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
+// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
+gpu.func @vector_insert_strided_slice_different_ranks() {
+  %0 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : () -> vector<16xf32>
+  %1 = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : () -> vector<64x16xf32>
+  %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16xf32> into vector<64x16xf32>
+  gpu.return
+}
 }

>From ec7032a2dcc246ae88e326a13490a07d5cb79454 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 4 Mar 2026 19:02:07 +0000
Subject: [PATCH 3/4] Clean up

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 18 ++-----
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 49 -------------------
 2 files changed, 4 insertions(+), 63 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 29c1e08427fd2..29f9e96df6c76 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -723,8 +723,7 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
 
 /// Distributes a subgroup-level vector.extract_strided_slice op to
 /// workitem-level. If the result is distributed, the offsets and sizes are
-/// adjusted to match the distributed types. Supports both distributed and
-/// non-distributed cases.
+/// adjusted to match the distributed types.
 struct SgToWiVectorExtractStridedSlice
     : public OpConversionPattern<vector::ExtractStridedSliceOp> {
   using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
@@ -741,7 +740,7 @@ struct SgToWiVectorExtractStridedSlice
     auto distResultTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
     // If distribution fails (e.g., dimension smaller than lane layout),
-    // the type stays unchanged (same behavior as TypeConverter).
+    // the type stays unchanged.
     VectorType distResultTy =
         succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
 
@@ -815,8 +814,7 @@ struct SgToWiVectorExtractStridedSlice
 
 /// Distributes a subgroup-level vector.insert_strided_slice op to
 /// workitem-level. If the dest is distributed, the offsets are adjusted to
-/// match the distributed types. Supports both distributed and non-distributed
-/// cases.
+/// match the distributed types.
 struct SgToWiVectorInsertStridedSlice
     : public OpConversionPattern<vector::InsertStridedSliceOp> {
   using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
@@ -833,7 +831,7 @@ struct SgToWiVectorInsertStridedSlice
     auto distDestTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
     // If distribution fails (e.g., dimension smaller than lane layout),
-    // the type stays unchanged (same behavior as TypeConverter).
+    // the type stays unchanged.
     VectorType distDestTy =
         succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
 
@@ -1131,28 +1129,20 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       [=](vector::MultiDimReductionOp op) -> bool {
         return !isValidSubgroupMultiReductionOp(op);
       });
-  // vector::ExtractOp is legal only if its result has no temporary layout
-  // attribute. Scalar extraction is always legal.
   target.addDynamicallyLegalOp<vector::ExtractOp>(
       [=](vector::ExtractOp op) -> bool {
         if (!isa<VectorType>(op.getType()))
           return true;
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
-  // vector::InsertOp is legal only if its result has no temporary layout
-  // attribute.
   target.addDynamicallyLegalOp<vector::InsertOp>(
       [=](vector::InsertOp op) -> bool {
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
-  // vector::ExtractStridedSliceOp is legal only if its result has no temporary
-  // layout attribute.
   target.addDynamicallyLegalOp<vector::ExtractStridedSliceOp>(
       [=](vector::ExtractStridedSliceOp op) -> bool {
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
-  // vector::InsertStridedSliceOp is legal only if its result has no temporary
-  // layout attribute.
   target.addDynamicallyLegalOp<vector::InsertStridedSliceOp>(
       [=](vector::InsertStridedSliceOp op) -> bool {
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index af6827680145c..520edd4b98ee1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -515,10 +515,6 @@ gpu.func @vector_insert_into_2d_idx2() {
   gpu.return
 }
 
-// extract_strided_slice: distributed dim fully extracted
-// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8,0] sizes [8,16] → [8,0] sizes [8,1]
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
@@ -533,10 +529,6 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
   gpu.return
 }
 
-// extract_strided_slice: non-distributed (source already has unit dim)
-// Source: vector<24x1xf32> layout [1,16] → stays vector<24x1xf32>
-// Result: vector<8x1xf32> layout [1,16] → stays vector<8x1xf32>
-// Offsets and sizes unchanged
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_non_distributed() {
@@ -551,10 +543,6 @@ gpu.func @vector_extract_strided_slice_non_distributed() {
   gpu.return
 }
 
-// extract_strided_slice: inner distributed (dim 1 distributed)
-// Source: vector<24x64xf32> layout [1,16] → distributed to vector<24x4xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8,48] → [8, 48/16=3], sizes [8,16] → [8,1]
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_inner_distributed() {
@@ -569,10 +557,6 @@ gpu.func @vector_extract_strided_slice_inner_distributed() {
   gpu.return
 }
 
-// extract_strided_slice: outer distributed (dim 0 distributed)
-// Source: vector<32x16xf32> layout [16,1] → distributed to vector<2x16xf32>
-// Result: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
-// Offsets [16] padded to [16,0] → [16/16=1, 0], sizes [16] padded to [16,16] → [1,16]
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1, 0], sizes = [1, 16], strides = [1, 1]} : vector<2x16xf32> to vector<1x16xf32>
 gpu.func @vector_extract_strided_slice_outer_distributed() {
@@ -587,10 +571,6 @@ gpu.func @vector_extract_strided_slice_outer_distributed() {
   gpu.return
 }
 
-// extract_strided_slice: 1D distributed
-// Source: vector<64xf32> layout [16] → distributed to vector<4xf32>
-// Result: vector<32xf32> layout [16] → distributed to vector<2xf32>
-// Offsets [16] → [16/16=1], sizes [32] → [2]
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
 gpu.func @vector_extract_strided_slice_1d() {
@@ -605,10 +585,6 @@ gpu.func @vector_extract_strided_slice_1d() {
   gpu.return
 }
 
-// extract_strided_slice: partial offsets (offsets rank < source rank)
-// Source: vector<24x16xf32> layout [1,16] → distributed to vector<24x1xf32>
-// Result: vector<8x16xf32> layout [1,16] → distributed to vector<8x1xf32>
-// Offsets [8] padded to [8,0], sizes [8] padded to [8,16] → [8,1]
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_partial_offsets() {
@@ -623,10 +599,6 @@ gpu.func @vector_extract_strided_slice_partial_offsets() {
   gpu.return
 }
 
-// insert_strided_slice: distributed dim fully inserted (dim 1 distributed)
-// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
-// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
-// Offsets [24,0] → [24,0] (offset 0 / 16 = 0)
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
 gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
@@ -645,10 +617,6 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
   gpu.return
 }
 
-// insert_strided_slice: non-distributed (types already have unit dim)
-// Source: vector<16x1xf32> layout [1,16] → stays vector<16x1xf32>
-// Dest: vector<64x1xf32> layout [1,16] → stays vector<64x1xf32>
-// Offsets unchanged
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
 gpu.func @vector_insert_strided_slice_non_distributed() {
@@ -667,10 +635,6 @@ gpu.func @vector_insert_strided_slice_non_distributed() {
   gpu.return
 }
 
-// insert_strided_slice: inner distributed (dim 1 distributed)
-// Source: vector<16x16xf32> layout [1,16] → distributed to vector<16x1xf32>
-// Dest: vector<64x32xf32> layout [1,16] → distributed to vector<64x2xf32>
-// Offsets [24,16] → [24, 16/16=1]
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
 gpu.func @vector_insert_strided_slice_inner_distributed() {
@@ -689,10 +653,6 @@ gpu.func @vector_insert_strided_slice_inner_distributed() {
   gpu.return
 }
 
-// insert_strided_slice: outer distributed (dim 0 distributed)
-// Source: vector<16x16xf32> layout [16,1] → distributed to vector<1x16xf32>
-// Dest: vector<48x32xf32> layout [16,1] → distributed to vector<3x32xf32>
-// Offsets [32,4] → [32/16=2, 4]
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
 gpu.func @vector_insert_strided_slice_outer_distributed() {
@@ -711,10 +671,6 @@ gpu.func @vector_insert_strided_slice_outer_distributed() {
   gpu.return
 }
 
-// insert_strided_slice: 1D distributed
-// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
-// Dest: vector<48xf32> layout [16] → distributed to vector<3xf32>
-// Offsets [16] → [16/16=1]
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
 gpu.func @vector_insert_strided_slice_1d() {
@@ -733,11 +689,6 @@ gpu.func @vector_insert_strided_slice_1d() {
   gpu.return
 }
 
-// insert_strided_slice: different ranks (1D source into 2D dest)
-// Source: vector<16xf32> layout [16] → distributed to vector<1xf32>
-// Dest: vector<64x16xf32> layout [1,16] → distributed to vector<64x1xf32>
-// Distributed dim 1, sourceDistDim = 1 - (2-1) = 0
-// Offsets [13,0] → [13, 0/16=0]
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
 gpu.func @vector_insert_strided_slice_different_ranks() {

>From 9f2552d4d02f62c2f4fc0e130af068c3b592bba9 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 10 Mar 2026 18:36:00 +0000
Subject: [PATCH 4/4] Address feedback

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 65 +++++++++++--------
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 40 ++----------
 2 files changed, 43 insertions(+), 62 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 29f9e96df6c76..c1173e22b96a0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -121,6 +121,20 @@ static bool isReductionLaneLocal(vector::MultiDimReductionOp op) {
   return resTy != resDistTypeOrFailure.value();
 }
 
+/// Given a vector type and its distributed vector type (same rank), return the
+/// indices of the distributed dims, i.e. those whose sizes differ between them.
+static SmallVector<int64_t> getDistributedDims(VectorType originalType,
+                                               VectorType distributedType) {
+  assert(originalType.getRank() == distributedType.getRank() &&
+         "original and distributed vector types must have the same rank");
+  SmallVector<int64_t> distributedDims;
+  for (int64_t i = 0; i < originalType.getRank(); ++i) {
+    if (distributedType.getDimSize(i) != originalType.getDimSize(i))
+      distributedDims.push_back(i);
+  }
+  return distributedDims;
+}
+
 /// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
 /// op. This simply drops the layout attribute from the tensor descriptor type.
 struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
@@ -739,18 +753,13 @@ struct SgToWiVectorExtractStridedSlice
     VectorType resultType = op.getType();
     auto distResultTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, resultType);
-    // If distribution fails (e.g., dimension smaller than lane layout),
-    // the type stays unchanged.
-    VectorType distResultTy =
-        succeeded(distResultTyOrFailure) ? *distResultTyOrFailure : resultType;
-
-    // Find distributed dimensions by comparing original and distributed
-    // result types.
-    SmallVector<int64_t> distributedDims;
-    for (int64_t i = 0; i < resultType.getRank(); ++i) {
-      if (distResultTy.getDimSize(i) != resultType.getDimSize(i))
-        distributedDims.push_back(i);
-    }
+    if (failed(distResultTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute distributed vector type from lane layout");
+    VectorType distResultTy = *distResultTyOrFailure;
+
+    SmallVector<int64_t> distributedDims =
+        getDistributedDims(resultType, distResultTy);
 
     // Collect updated sizes, offsets, strides. Pad to full source rank.
     int64_t sourceRank = op.getSourceVectorType().getRank();
@@ -774,12 +783,15 @@ struct SgToWiVectorExtractStridedSlice
         return rewriter.notifyMatchFailure(
             op, "only single dimension distribution is supported");
       int64_t distDim = distributedDims[0];
+      const uArch *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+      if (!uArch)
+        return rewriter.notifyMatchFailure(
+            op, "target attribute required to determine subgroup size");
+      int subgroupSize = uArch->getSubgroupSize();
       auto sourceLayout = xegpu::getTemporaryLayout(op->getOpOperand(0));
       if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
         return rewriter.notifyMatchFailure(
             op, "source of extract_strided_slice lacks distribution layout");
-      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
-      int subgroupSize = sourceLaneLayout[distDim];
       int sourceDistrDimSize = op.getSourceVectorType().getShape()[distDim];
       if (sourceDistrDimSize % subgroupSize != 0)
         return rewriter.notifyMatchFailure(
@@ -830,17 +842,13 @@ struct SgToWiVectorInsertStridedSlice
     VectorType destType = op.getDestVectorType();
     auto distDestTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
-    // If distribution fails (e.g., dimension smaller than lane layout),
-    // the type stays unchanged.
-    VectorType distDestTy =
-        succeeded(distDestTyOrFailure) ? *distDestTyOrFailure : destType;
-
-    // Find distributed dimensions of the dest vector.
-    SmallVector<int64_t> destDistributedDims;
-    for (int64_t i = 0; i < destType.getRank(); ++i) {
-      if (distDestTy.getDimSize(i) != destType.getDimSize(i))
-        destDistributedDims.push_back(i);
-    }
+    if (failed(distDestTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute distributed vector type from lane layout");
+    VectorType distDestTy = *distDestTyOrFailure;
+
+    SmallVector<int64_t> destDistributedDims =
+        getDistributedDims(destType, distDestTy);
 
     SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
         op.getOffsets(), [](Attribute attr) { return attr; });
@@ -851,6 +859,12 @@ struct SgToWiVectorInsertStridedSlice
             op, "only single dimension distribution is supported");
       int64_t destDistDim = destDistributedDims[0];
 
+      const uArch *uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+      if (!uArch)
+        return rewriter.notifyMatchFailure(
+            op, "target attribute required to determine subgroup size");
+      int subgroupSize = uArch->getSubgroupSize();
+
       VectorType srcType = op.getSourceVectorType();
       // The distributed dim must be in the last k (source rank) dims of dest.
       int64_t sourceDistDim =
@@ -868,7 +882,6 @@ struct SgToWiVectorInsertStridedSlice
             op, "source or dest of insert_strided_slice lacks distribution "
                 "layout");
 
-      int subgroupSize = destLayout.getEffectiveLaneLayoutAsInt()[destDistDim];
       auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
       auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
       if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 520edd4b98ee1..c2c97c5f32a7c 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,9 +473,9 @@ gpu.func @vector_extract_from_2d() {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_extract_from_2d_idx2
+// CHECK-LABEL: gpu.func @vector_extract_from_2d_offset2
 // CHECK: %[[EXT:.*]] = vector.extract %{{.*}}[2] : vector<1xf32> from vector<8x1xf32>
-gpu.func @vector_extract_from_2d_idx2() {
+gpu.func @vector_extract_from_2d_offset2() {
   %src = "some_op"()
     {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     : () -> vector<8x16xf32>
@@ -500,9 +500,9 @@ gpu.func @vector_insert_into_2d() {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_insert_into_2d_idx2
+// CHECK-LABEL: gpu.func @vector_insert_into_2d_offset2
 // CHECK: %[[INS:.*]] = vector.insert %{{.*}}, %{{.*}}[2] : vector<1xf32> into vector<8x1xf32>
-gpu.func @vector_insert_into_2d_idx2() {
+gpu.func @vector_insert_into_2d_offset2() {
   %val = "some_op"()
     {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
     : () -> vector<16xf32>
@@ -529,20 +529,6 @@ gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted() {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
-// CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-gpu.func @vector_extract_strided_slice_non_distributed() {
-  %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : () -> vector<24x1xf32>
-  %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-    : vector<24x1xf32> to vector<8x1xf32>
-  gpu.return
-}
-
 // CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
 // CHECK: %[[ESS:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
 gpu.func @vector_extract_strided_slice_inner_distributed() {
@@ -617,24 +603,6 @@ gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted() {
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
-// CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-gpu.func @vector_insert_strided_slice_non_distributed() {
-  %0 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : () -> vector<16x1xf32>
-  %1 = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : () -> vector<64x1xf32>
-  %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-    : vector<16x1xf32> into vector<64x1xf32>
-  gpu.return
-}
-
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
 // CHECK: %[[ISS:.*]] = vector.insert_strided_slice %{{.*}}, %{{.*}} {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
 gpu.func @vector_insert_strided_slice_inner_distributed() {