[Mlir-commits] [mlir] fe4c2bb - [mlir][xegpu] Deprecate XeGPUSubgroupDistribute and rename XeGPUSgToWiDistributeExperimental to XeGPUSgToLaneDistribute (#198027)

Tue May 26 07:52:52 PDT 2026

Author: Nishant Patel
Date: 2026-05-26T07:52:46-07:00
New Revision: fe4c2bb1b3d5626d7de8112e815b8548b20a81fe

URL: https://github.com/llvm/llvm-project/commit/fe4c2bb1b3d5626d7de8112e815b8548b20a81fe
DIFF: https://github.com/llvm/llvm-project/commit/fe4c2bb1b3d5626d7de8112e815b8548b20a81fe.diff

LOG: [mlir][xegpu] Deprecate XeGPUSubgroupDistribute and rename XeGPUSgToWiDistributeExperimental to XeGPUSgToLaneDistribute (#198027)

The XeGPUSubgroupDistribute pass is now fully superseded by the newer
subgroup-to-lane distribution flow, so this PR removes its
implementation & all associated tests.
The replacement pass XeGPUSgToWiDistributeExperimental is renamed to
XeGPUSgToLaneDistribute.

Added: 
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
    mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
    mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir

Modified: 
    mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
    mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
    mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
    mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
    mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
    mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp

Removed: 
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
    mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
    mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
    mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
    mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
    mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
    mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
    mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir


################################################################################
diff  --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 4bee1752b271e..90aa15f75f55f 100644

--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -11,15 +11,6 @@
 
 include "mlir/Pass/PassBase.td"
 
-def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
-  let summary = "Distribute XeGPU ops to work items";
-  let description = [{
-    The pass distributes subgroup level (SIMD) XeGPU ops to work items.
-  }];
-  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
-                           "vector::VectorDialect", "index::IndexDialect"];
-}
-
 def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
   let summary = "Propagate and assign XeGPU layout information";
   let description = [{
@@ -109,10 +100,10 @@ def XeGPUPeepHoleOptimizer : Pass<"xegpu-optimize-peephole"> {
                            "vector::VectorDialect"];
 }
 
-def XeGPUSgToWiDistributeExperimental : Pass<"xegpu-sg-to-wi-distribute-experimental"> {
-  let summary = "Distribute XeGPU ops to work items";
+def XeGPUSgToLaneDistribute : Pass<"xegpu-sg-to-lane-distribute"> {
+  let summary = "Distribute XeGPU ops to lanes";
   let description = [{
-    The pass distributes subgroup level XeGPU ops to work item level XeGPU ops.
+    The pass distributes subgroup level XeGPU ops to lane level XeGPU ops.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect", "index::IndexDialect"];

diff  --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index a21866b5cc33f..919a69908bdce 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -65,21 +65,18 @@ struct UnrollOptions {
 void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns);
 /// Appends patterns for array length optimization into `patterns`.
 void populateXeGPUArrayLengthOptimizationPatterns(RewritePatternSet &patterns);
-/// Appends patterns for XeGPU SIMT distribution into `patterns`.
-void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
-/// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
-void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU workgroup to subgroup distribution into
 /// `patterns`.
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
-/// Define only the type conversions needed for XeGPU subgroup to workitem
+/// Define only the type conversions needed for XeGPU subgroup to lane
 /// distribution.
-void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
-/// Defines type conversions and legality for XeGPU subgroup to workitem
+void populateXeGPUSgToLaneDistributeTypeConversions(
+    TypeConverter &typeConverter);
+/// Defines type conversions and legality for XeGPU subgroup to lane
 /// distribution and appends the required conversion patterns into `patterns`.
-/// Appends patterns for XeGPU subgroup to workitem distribution into
+/// Appends patterns for XeGPU subgroup to lane distribution into
 /// `patterns`.
-void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+void populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target);
 

diff  --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index ec5591aa85613..fa0ab712fa64d 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -91,8 +91,7 @@ void buildGPUPassPipeline(OpPassManager &pm,
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(laneLayoutOptions));
-    pm.addNestedPass<gpu::GPUModuleOp>(
-        xegpu::createXeGPUSgToWiDistributeExperimental());
+    pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSgToLaneDistribute());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass());

diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 0e30a6ee6e3f0..37922f7ef7d24 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,8 +1,7 @@
 add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUArrayLengthOptimization.cpp
   XeGPUBlocking.cpp
-  XeGPUSgToWiDistributeExperimental.cpp
-  XeGPUSubgroupDistribute.cpp
+  XeGPUSgToLaneDistribute.cpp
   XeGPUUnroll.cpp
   XeGPUWgToSgDistribute.cpp
   XeGPUPropagateLayout.cpp

diff  --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
similarity index 90%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
index 2af5429395526..8a926754e7cfb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
@@ -1,4 +1,4 @@
-//===- XeGPUSgToWiDistributeExperimental.cpp - XeGPU SG to WI Pass --------===//
+//===- XeGPUSgToLaneDistribute.cpp - XeGPU SG to Lane Pass ----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -33,14 +33,14 @@
 
 namespace mlir {
 namespace xegpu {
-#define GEN_PASS_DEF_XEGPUSGTOWIDISTRIBUTEEXPERIMENTAL
+#define GEN_PASS_DEF_XEGPUSGTOLANEDISTRIBUTE
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
 } // namespace xegpu
 } // namespace mlir
 
 using namespace mlir;
 
-#define DEBUG_TYPE "xegpu-sg-to-wi-distribute-experimental"
+#define DEBUG_TYPE "xegpu-sg-to-lane-distribute"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
 namespace {
@@ -84,7 +84,7 @@ static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
   return op.getReductionDims().size() == 1;
 }
 
-/// A vector::MultiDimReductionOp is doing lane-local reduction if each workitem
+/// A vector::MultiDimReductionOp is doing lane-local reduction if each lane
 /// is doing its own local reduction. In this case the result layout ensures
 /// that result vector is distributed to lanes, i.e. the result vector type is
 /// different from the distributed result vector type.
@@ -112,9 +112,10 @@ static SmallVector<int64_t> getDistributedDims(VectorType originalType,
   return distributedDims;
 }
 
-/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
+/// Distributes a subgroup-level CreateNdDesc op to lane-level CreateNdDesc
 /// op. This simply drops the layout attribute from the tensor descriptor type.
-struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
+struct SgToLaneCreateNdDesc
+    : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
   LogicalResult
@@ -133,10 +134,10 @@ struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
   }
 };
 
-/// Distributes a subgroup-level LoadNd op to workitem-level LoadNd op. Output
-/// of workitem-level LoadNd op is 1D. ShapeCast is added to restore the
+/// Distributes a subgroup-level LoadNd op to lane-level LoadNd op. Output
+/// of lane-level LoadNd op is 1D. ShapeCast is added to restore the
 /// original rank.
-struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
+struct SgToLaneLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
   using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
 
   LogicalResult
@@ -157,19 +158,18 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
           op, "xegpu::LoadNdOp require target attribute attached to "
               "determine transpose "
               "requirement");
-    auto supportedWiResultTyOrFailure =
+    auto supportedLaneResultTyOrFailure =
         xegpu::getDistributedVectorType(op.getTensorDescType());
-    auto expectedWiResultTyOrFailure =
+    auto expectedLaneResultTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(layout, op.getType());
-    if (failed(supportedWiResultTyOrFailure))
+    if (failed(supportedLaneResultTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute the workitem vector type for LoadNdOp");
-    if (failed(expectedWiResultTyOrFailure))
+          op, "unable to compute the lane vector type for LoadNdOp");
+    if (failed(expectedLaneResultTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op,
-          "unable to compute expected workitem vector type from lane layout");
+          op, "unable to compute expected lane vector type from lane layout");
     auto newOp = xegpu::LoadNdOp::create(
-        rewriter, op.getLoc(), supportedWiResultTyOrFailure.value(),
+        rewriter, op.getLoc(), supportedLaneResultTyOrFailure.value(),
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
         op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
         op.getL3HintAttr(), /**layout**/ nullptr);
@@ -179,15 +179,15 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
     if (xegpu::requireTranspose(cast<xegpu::LayoutAttr>(layout), uArch))
       newOp.setTranspose(DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
     rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
-                                       expectedWiResultTyOrFailure.value()));
+                                       expectedLaneResultTyOrFailure.value()));
     return success();
   }
 };
 
-/// Distributes a subgroup-level StoreNd op to workitem-level StoreNd op. Stored
-/// value in workitem-level StoreNd op is 1D. ShapeCast is added to cast the
+/// Distributes a subgroup-level StoreNd op to lane-level StoreNd op. Stored
+/// value in lane-level StoreNd op is 1D. ShapeCast is added to cast the
 /// incoming value to 1D.
-struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
+struct SgToLaneStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
   using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
 
   LogicalResult
@@ -206,18 +206,18 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
     if (valueLayout != layout)
       return rewriter.notifyMatchFailure(
           op, "conflicting layout attributes on value and anchor");
-    auto supportedWiValueTyOrFailure =
+    auto supportedLaneValueTyOrFailure =
         xegpu::getDistributedVectorType(op.getTensorDescType());
-    if (failed(supportedWiValueTyOrFailure))
+    if (failed(supportedLaneValueTyOrFailure))
       return rewriter.notifyMatchFailure(
           op,
-          "unable to compute wi vector type for StoreNdOp value from tensor "
+          "unable to compute lane vector type for StoreNdOp value from tensor "
           "descriptor");
 
     xegpu::StoreNdOp::create(
         rewriter, op.getLoc(),
         castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
-                    supportedWiValueTyOrFailure.value()),
+                    supportedLaneValueTyOrFailure.value()),
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
         op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
     rewriter.eraseOp(op);
@@ -225,10 +225,10 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
   }
 };
 
-/// Distributes a subgroup-level Dpas op to workitem-level Dpas op. All inpputs
-/// and output of workitem-level Dpas op are 1D. Necessary casts are added to
+/// Distributes a subgroup-level Dpas op to lane-level Dpas op. All inpputs
+/// and output of lane-level Dpas op are 1D. Necessary casts are added to
 /// convert the inputs and output to/from 1D.
-struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
+struct SgToLaneDpas : public OpConversionPattern<xegpu::DpasOp> {
   using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
 
   LogicalResult
@@ -240,22 +240,22 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
     auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
     if (!layoutA || !layoutB || !layoutCd)
       return failure();
-    auto wiResultTyOrFailure =
+    auto laneResultTyOrFailure =
         xegpu::getDistributedVectorType(op.getType(), layoutCd);
-    auto wiATypeOrFailure =
+    auto laneATypeOrFailure =
         xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
-    auto wiBTypeOrFailure =
+    auto laneBTypeOrFailure =
         xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
-    auto expectedWiResultTyOrFailure =
+    auto expectedLaneResultTyOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
-    if (failed(wiResultTyOrFailure) || failed(wiATypeOrFailure) ||
-        failed(wiBTypeOrFailure))
+    if (failed(laneResultTyOrFailure) || failed(laneATypeOrFailure) ||
+        failed(laneBTypeOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "failed to calculate supported workitem vector types for DpasOp "
+          op, "failed to calculate supported lane vector types for DpasOp "
               "from layouts");
-    if (failed(expectedWiResultTyOrFailure))
+    if (failed(expectedLaneResultTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute expected workitem vector type for DpasOp from "
+          op, "unable to compute expected lane vector type for DpasOp from "
               "lane layout");
 
     // Validate bit widths match uArch packed format requirements
@@ -266,13 +266,13 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
               uArch->getInstruction(
                   xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
       if (uArchInstruction) {
-        auto wiAType = wiATypeOrFailure.value();
-        auto wiBType = wiBTypeOrFailure.value();
+        auto laneAType = laneATypeOrFailure.value();
+        auto laneBType = laneBTypeOrFailure.value();
         // Calculate total packed bit width = element bit width * vector size
         unsigned aPackedBitWidth =
-            wiAType.getElementTypeBitWidth() * wiAType.getNumElements();
+            laneAType.getElementTypeBitWidth() * laneAType.getNumElements();
         unsigned bPackedBitWidth =
-            wiBType.getElementTypeBitWidth() * wiBType.getNumElements();
+            laneBType.getElementTypeBitWidth() * laneBType.getNumElements();
         unsigned expectedABitSize = uArchInstruction->getPackedFormatBitSizeA();
         unsigned expectedBBitSize = uArchInstruction->getPackedFormatBitSizeB();
 
@@ -290,26 +290,26 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
     }
 
     auto newOp = xegpu::DpasOp::create(
-        rewriter, op->getLoc(), wiResultTyOrFailure.value(),
+        rewriter, op->getLoc(), laneResultTyOrFailure.value(),
         castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
-                    wiATypeOrFailure.value()),
+                    laneATypeOrFailure.value()),
         castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
-                    wiBTypeOrFailure.value()),
+                    laneBTypeOrFailure.value()),
         castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
-                    wiResultTyOrFailure.value()),
+                    laneResultTyOrFailure.value()),
         /** layoutA**/ nullptr,
         /** layoutB**/ nullptr, /** layoutCd**/ nullptr);
     // Explicitly set the new types to enable correct type materializations.
     rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
-                                       expectedWiResultTyOrFailure.value()));
+                                       expectedLaneResultTyOrFailure.value()));
     return success();
   }
 };
 
-/// Distributes elementwise ops to workitem-level elementwise ops. This
+/// Distributes elementwise ops to lane-level elementwise ops. This
 /// currently handles elementwise ops with single result only.
-struct SgToWiElementWise : public ConversionPattern {
-  SgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
+struct SgToLaneElementWise : public ConversionPattern {
+  SgToLaneElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
       : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
 
   LogicalResult
@@ -330,14 +330,14 @@ struct SgToWiElementWise : public ConversionPattern {
       return rewriter.notifyMatchFailure(
           op, "operation result does not have subgroup distribute layout");
 
-    auto wiShapeOrFailure =
+    auto laneShapeOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
 
-    if (failed(wiShapeOrFailure))
+    if (failed(laneShapeOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute workitem vector type from the layout");
+          op, "unable to compute lane vector type from the layout");
 
-    VectorType newResultType = wiShapeOrFailure.value();
+    VectorType newResultType = laneShapeOrFailure.value();
     OperationState state(op->getLoc(), op->getName());
     state.addOperands(operands);
     state.addTypes(newResultType);
@@ -353,9 +353,9 @@ struct SgToWiElementWise : public ConversionPattern {
   }
 };
 
-/// Distributes a subgroup-level arith ConstantOp to workitem-level arith
+/// Distributes a subgroup-level arith ConstantOp to lane-level arith
 /// ConstantOp.
-struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
+struct SgToLaneArithConstant : public OpConversionPattern<arith::ConstantOp> {
   using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
 
   LogicalResult
@@ -377,14 +377,14 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
       return rewriter.notifyMatchFailure(
           op, "operation result does not have subgroup distribute layout");
 
-    auto wiShapeOrFailure =
+    auto laneShapeOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
 
-    if (failed(wiShapeOrFailure))
+    if (failed(laneShapeOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute workitem vector type from the layout");
+          op, "unable to compute lane vector type from the layout");
 
-    VectorType newResultType = wiShapeOrFailure.value();
+    VectorType newResultType = laneShapeOrFailure.value();
     auto sclarValue = dense.getSplatValue<Attribute>();
     auto newDenseAttr = DenseElementsAttr::get(newResultType, sclarValue);
 
@@ -395,8 +395,8 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
   }
 };
 
-/// Distributes a subgroup-level PrefetchNd op to workitem-level PrefetchNd op.
-struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
+/// Distributes a subgroup-level PrefetchNd op to lane-level PrefetchNd op.
+struct SgToLanePrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
   using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
 
   LogicalResult
@@ -416,7 +416,7 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
   }
 };
 
-/// Distributes a subgroup-level LoadGather (xegpu.load) op to workitem-level.
+/// Distributes a subgroup-level LoadGather (xegpu.load) op to lane-level.
 ///
 /// Example 1 (1D, no chunk size):
 ///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
@@ -449,7 +449,7 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
 ///   %offset = producer_op : vector<1x1x1xindex>
 ///   %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
 ///     vector<1xindex>, vector<1xi1> -> vector<1xf16>
-struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
+struct SgToLaneLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
   using OpConversionPattern<xegpu::LoadGatherOp>::OpConversionPattern;
 
   LogicalResult
@@ -478,8 +478,7 @@ struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
         xegpu::getDistVecTypeBasedOnLaneLayout(layout, origResultTy);
     if (failed(distResultTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op,
-          "unable to compute expected workitem vector type from lane layout");
+          op, "unable to compute expected lane vector type from lane layout");
 
     VectorType distResultTy = distResultTyOrFailure.value();
     VectorType distResultTy1D = VectorType::get({distResultTy.getNumElements()},
@@ -516,10 +515,11 @@ struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
 };
 
 /// This pattern distributes a subgroup-level vector.reduction op to
-/// workitem-level. This require shuffling the data across the workitems (using
-/// gpu::ShuffleOp) and reducing in stages until all workitems have the final
+/// lane-level. This require shuffling the data across the lanes (using
+/// gpu::ShuffleOp) and reducing in stages until all lanes have the final
 /// result.
-struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
+struct SgToLaneVectorReduction
+    : public OpConversionPattern<vector::ReductionOp> {
   using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
 
   LogicalResult
@@ -561,10 +561,10 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
           op, "Reduction distribution currently only supports floats and "
               "integer types.");
 
-    // Get the distributed vector (per work-item portion).
+    // Get the distributed vector (per lane portion).
     Value laneValVec = adaptor.getVector();
 
-    // Distribute and reduce across work-items in the subgroup.
+    // Distribute and reduce across lanes in the subgroup.
     Value fullReduce = xegpu::subgroupReduction(
         op.getLoc(), rewriter, laneValVec, op.getKind(), sgSize);
 
@@ -579,10 +579,10 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
 };
 
 /// This pattern distributes a subgroup-level vector.multi_reduction op to
-/// workitem-level only if the reduction is lane-local. This means that
+/// lane-level only if the reduction is lane-local. This means that
 /// reduction dimension is not distributed to lanes and each lane does its own
 /// local reduction.
-struct SgToWiMultiDimReduction
+struct SgToLaneMultiDimReduction
     : public OpConversionPattern<vector::MultiDimReductionOp> {
   using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
 
@@ -643,7 +643,7 @@ struct SgToWiMultiDimReduction
 };
 
 /// Helper to compute distributed coordinates for matrix ops.
-/// When not using subgroup_block_io, each workitem computes its own
+/// When not using subgroup_block_io, each lane computes its own
 /// coordinates based on the layout and lane ID.
 static SmallVector<Value> computeDistributedCoordsForMatrixOp(
     ConversionPatternRewriter &rewriter, Location loc,
@@ -663,8 +663,8 @@ static SmallVector<Value> computeDistributedCoordsForMatrixOp(
   return llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
 }
 
-/// This pattern distributes a subgroup-level LoadMatrix op to workitem-level.
-struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
+/// This pattern distributes a subgroup-level LoadMatrix op to lane-level.
+struct SgToLaneLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
   using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;
 
   LogicalResult
@@ -717,8 +717,9 @@ struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
   }
 };
 
-/// Distributes a subgroup-level vector.transpose op to workitem-level.
-struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
+/// Distributes a subgroup-level vector.transpose op to lane-level.
+struct SgToLaneVectorTranspose
+    : public OpConversionPattern<vector::TransposeOp> {
   using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
 
   LogicalResult
@@ -753,9 +754,9 @@ struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
   }
 };
 
-/// Distributes a subgroup-level vector.bitcast op to workitem-level.
+/// Distributes a subgroup-level vector.bitcast op to lane-level.
 /// Bitcast only impacts the innermost dimension of the source/result vectors.
-struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
+struct SgToLaneVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
   using OpConversionPattern<vector::BitCastOp>::OpConversionPattern;
 
   LogicalResult
@@ -781,8 +782,8 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
 };
 
 /// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
-/// to workitem-level. Uses `computeDistributedCoords()` to obtain the
-/// coordinates each workitem owns, then compares each coordinate against the
+/// to lane-level. Uses `computeDistributedCoords()` to obtain the
+/// coordinates each lane owns, then compares each coordinate against the
 /// original mask bounds using `arith.cmpi slt`. The per-element boolean
 /// results are assembled into the distributed mask vector.
 ///
@@ -806,7 +807,7 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
 template <typename OpType,
           typename = std::enable_if_t<llvm::is_one_of<
               OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
-struct SgToWiCreateMask : public OpConversionPattern<OpType> {
+struct SgToLaneCreateMask : public OpConversionPattern<OpType> {
   using OpConversionPattern<OpType>::OpConversionPattern;
 
   LogicalResult
@@ -823,7 +824,7 @@ struct SgToWiCreateMask : public OpConversionPattern<OpType> {
         getDistVecTypeBasedOnLaneLayout(layout, origType);
     if (failed(distTypeOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute workitem vector type from the layout");
+          op, "unable to compute lane vector type from the layout");
 
     VectorType distType = distTypeOrFailure.value();
     Location loc = op.getLoc();
@@ -884,8 +885,8 @@ struct SgToWiCreateMask : public OpConversionPattern<OpType> {
   }
 };
 
-/// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
-struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
+/// This pattern distributes a subgroup-level StoreMatrix op to lane-level.
+struct SgToLaneStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
   using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
 
   LogicalResult
@@ -941,7 +942,7 @@ struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
 };
 
 /// Distributes a subgroup-level StoreScatter (xegpu.store) op to
-/// workitem-level.
+/// lane-level.
 ///
 /// Example 1 (1D, no chunk size):
 ///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
@@ -974,7 +975,8 @@ struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
 ///   %offset = producer_op : vector<1x1x1xindex>
 ///   xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
 ///     memref<256xf16>, vector<1xindex>, vector<1xi1>
-struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
+struct SgToLaneStoreScatter
+    : public OpConversionPattern<xegpu::StoreScatterOp> {
   using OpConversionPattern<xegpu::StoreScatterOp>::OpConversionPattern;
 
   LogicalResult
@@ -1002,8 +1004,7 @@ struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
         xegpu::getDistVecTypeBasedOnLaneLayout(layout, origValueTy);
     if (failed(distValueTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op,
-          "unable to compute expected workitem vector type from lane layout");
+          op, "unable to compute expected lane vector type from lane layout");
 
     VectorType distValueTy = distValueTyOrFailure.value();
     VectorType distValueTy1D = VectorType::get({distValueTy.getNumElements()},
@@ -1039,11 +1040,11 @@ struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
   }
 };
 
-/// Distribute a vector::StepOp to workitem-level.
+/// Distribute a vector::StepOp to lane-level.
 /// The layout must have exactly 1 effective lane dimension.
 /// We completely resolve the vector::StepOp by computing the lane_data-sized
 /// subranges.
-struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
+struct SgToLaneVectorStep : public OpConversionPattern<vector::StepOp> {
   using OpConversionPattern<vector::StepOp>::OpConversionPattern;
 
   LogicalResult
@@ -1057,12 +1058,12 @@ struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
 
     auto loc = op.getLoc();
     auto stepResultVecTy = op.getResult().getType();
-    auto wiShapeOrFailure =
+    auto laneShapeOrFailure =
         xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, stepResultVecTy);
-    if (failed(wiShapeOrFailure))
+    if (failed(laneShapeOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute workitem vector type from the layout");
-    VectorType newVecTy = wiShapeOrFailure.value();
+          op, "unable to compute lane vector type from the layout");
+    VectorType newVecTy = laneShapeOrFailure.value();
 
     Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
                                          /*upperBound=*/mlir::IntegerAttr());
@@ -1103,9 +1104,9 @@ struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
   }
 };
 
-/// Distributes a subgroup-level vector.extract op to workitem-level. Only
+/// Distributes a subgroup-level vector.extract op to lane-level. Only
 /// handles sub-vector extraction (result is VectorType, not scalar).
-struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
+struct SgToLaneVectorExtract : public OpConversionPattern<vector::ExtractOp> {
   using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
 
   LogicalResult
@@ -1137,8 +1138,9 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
   }
 };
 
-/// This pattern distributes a subgroup-level ShapeCast op to workitem-level.
-struct SgToWiVectorShapeCast : public OpConversionPattern<vector::ShapeCastOp> {
+/// This pattern distributes a subgroup-level ShapeCast op to lane-level.
+struct SgToLaneVectorShapeCast
+    : public OpConversionPattern<vector::ShapeCastOp> {
   using OpConversionPattern<vector::ShapeCastOp>::OpConversionPattern;
 
   LogicalResult
@@ -1165,9 +1167,9 @@ struct SgToWiVectorShapeCast : public OpConversionPattern<vector::ShapeCastOp> {
 };
 
 /// Distributes a subgroup-level vector.extract_strided_slice op to
-/// workitem-level. If the result is distributed, the offsets and sizes are
+/// lane-level. If the result is distributed, the offsets and sizes are
 /// adjusted to match the distributed types.
-struct SgToWiVectorExtractStridedSlice
+struct SgToLaneVectorExtractStridedSlice
     : public OpConversionPattern<vector::ExtractStridedSliceOp> {
   using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
 
@@ -1257,7 +1259,7 @@ struct SgToWiVectorExtractStridedSlice
 };
 
 /// This pattern distributes a subgroup-level `vector.broadcast` op to
-/// workitem-level. The pattern supports three cases:
+/// lane-level. The pattern supports three cases:
 ///
 /// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
 ///    vector must have a slice layout of the result. If the distributed source
@@ -1313,7 +1315,7 @@ struct SgToWiVectorExtractStridedSlice
 ///   %0 = "some_op"() : f16
 ///   %1 = vector.broadcast %0 : f16 to vector<16x1xf16>
 /// ```
-struct SgToWiBroadcast : public OpConversionPattern<vector::BroadcastOp> {
+struct SgToLaneBroadcast : public OpConversionPattern<vector::BroadcastOp> {
   using OpConversionPattern<vector::BroadcastOp>::OpConversionPattern;
 
   LogicalResult
@@ -1376,9 +1378,9 @@ struct SgToWiBroadcast : public OpConversionPattern<vector::BroadcastOp> {
 };
 
 /// Distributes a subgroup-level vector.insert_strided_slice op to
-/// workitem-level. If the dest is distributed, the offsets are adjusted to
+/// lane-level. If the dest is distributed, the offsets are adjusted to
 /// match the distributed types.
-struct SgToWiVectorInsertStridedSlice
+struct SgToLaneVectorInsertStridedSlice
     : public OpConversionPattern<vector::InsertStridedSliceOp> {
   using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
 
@@ -1470,9 +1472,9 @@ struct SgToWiVectorInsertStridedSlice
   }
 };
 
-/// Distributes a subgroup-level vector.insert op to workitem-level. Only
+/// Distributes a subgroup-level vector.insert op to lane-level. Only
 /// handles sub-vector insertion (value to store is VectorType, not scalar).
-struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
+struct SgToLaneVectorInsert : public OpConversionPattern<vector::InsertOp> {
   using OpConversionPattern<vector::InsertOp>::OpConversionPattern;
 
   LogicalResult
@@ -1506,7 +1508,7 @@ struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
 };
 
 /// Folds a subgroup-level ConvertLayout op with compatible lane layouts.
-struct SgToWiConvertLayout
+struct SgToLaneConvertLayout
     : public OpConversionPattern<xegpu::ConvertLayoutOp> {
   using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
 
@@ -1536,7 +1538,7 @@ struct SgToWiConvertLayout
 };
 
 // Trivially distribute `vector.interleave`
-struct SgToWiVectorInterleave
+struct SgToLaneVectorInterleave
     : public OpConversionPattern<vector::InterleaveOp> {
   using OpConversionPattern<vector::InterleaveOp>::OpConversionPattern;
 
@@ -1552,7 +1554,7 @@ struct SgToWiVectorInterleave
 };
 
 // Trivially distribute `vector.deinterleave`
-struct SgToWiVectorDeinterleave
+struct SgToLaneVectorDeinterleave
     : public OpConversionPattern<vector::DeinterleaveOp> {
   using OpConversionPattern<vector::DeinterleaveOp>::OpConversionPattern;
 
@@ -1567,7 +1569,7 @@ struct SgToWiVectorDeinterleave
   }
 };
 
-struct SgToWiDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
+struct SgToLaneDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
   using OpConversionPattern<xegpu::DpasMxOp>::OpConversionPattern;
 
   LogicalResult
@@ -1682,15 +1684,15 @@ struct SgToWiDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
   }
 };
 
-struct XeGPUSgToWiDistributeExperimentalPass
-    : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
-          XeGPUSgToWiDistributeExperimentalPass> {
+struct XeGPUSgToLaneDistributePass
+    : public xegpu::impl::XeGPUSgToLaneDistributeBase<
+          XeGPUSgToLaneDistributePass> {
   void runOnOperation() override;
 };
 
 } // namespace
 
-void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+void XeGPUSgToLaneDistributePass::runOnOperation() {
 
   // Recover temporary operand layouts for usage in patterns.
   Operation *root = getOperation();
@@ -1719,17 +1721,17 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
     RewritePatternSet patterns(&getContext());
     typeConverter.addSourceMaterialization(materializeCast);
     typeConverter.addTargetMaterialization(materializeCast);
-    xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+    xegpu::populateXeGPUSgToLaneDistributeTypeConversions(typeConverter);
     scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
                                                          patterns, target);
-    xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+    xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
         typeConverter, patterns, target);
     target.addLegalOp<UnrealizedConversionCastOp>();
     (void)applyPartialConversion(root, target, std::move(patterns));
   }
   // Structural type conversion can generate some redundant
   // UnrealizedConversionCastOps to materialize the SG type from type converted
-  // WI type. These are redundant at this point and can be eliminated by
+  // lane type. These are redundant at this point and can be eliminated by
   // inserting shape casts instead.
   // Example:
   // %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32> to vector<16x16xf32>
@@ -1790,7 +1792,7 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
   xegpu::removeTemporaryLayoutAttrs(getOperation());
 }
 
-void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
+void xegpu::populateXeGPUSgToLaneDistributeTypeConversions(
     TypeConverter &typeConverter) {
   // Any type other than TensorDescType and VectorType are legal as is.
   typeConverter.addConversion([](Type type) -> std::optional<Type> {
@@ -1824,10 +1826,10 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
   });
 }
 
-void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+void xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target) {
-  populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+  populateXeGPUSgToLaneDistributeTypeConversions(typeConverter);
   // CreateNdDescOp is legal only if its result type has no layout attribute.
   target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
       [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
@@ -1914,16 +1916,17 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-  patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
-               SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
-               SgToWiLoadGather, SgToWiStoreScatter, SgToWiVectorReduction,
-               SgToWiMultiDimReduction, SgToWiVectorExtract, SgToWiVectorInsert,
-               SgToWiVectorExtractStridedSlice, SgToWiVectorInsertStridedSlice,
-               SgToWiLoadMatrix, SgToWiStoreMatrix, SgToWiConvertLayout,
-               SgToWiVectorTranspose, SgToWiVectorBitcast, SgToWiVectorStep,
-               SgToWiVectorShapeCast, SgToWiBroadcast,
-               SgToWiCreateMask<vector::CreateMaskOp>,
-               SgToWiCreateMask<vector::ConstantMaskOp>,
-               SgToWiVectorDeinterleave, SgToWiVectorInterleave, SgToWiDpasMx>(
-      typeConverter, patterns.getContext());
+  patterns.add<
+      SgToLaneCreateNdDesc, SgToLaneLoadNd, SgToLaneStoreNd, SgToLaneDpas,
+      SgToLaneElementWise, SgToLaneArithConstant, SgToLanePrefetchNd,
+      SgToLaneLoadGather, SgToLaneStoreScatter, SgToLaneVectorReduction,
+      SgToLaneMultiDimReduction, SgToLaneVectorExtract, SgToLaneVectorInsert,
+      SgToLaneVectorExtractStridedSlice, SgToLaneVectorInsertStridedSlice,
+      SgToLaneLoadMatrix, SgToLaneStoreMatrix, SgToLaneConvertLayout,
+      SgToLaneVectorTranspose, SgToLaneVectorBitcast, SgToLaneVectorStep,
+      SgToLaneVectorShapeCast, SgToLaneBroadcast,
+      SgToLaneCreateMask<vector::CreateMaskOp>,
+      SgToLaneCreateMask<vector::ConstantMaskOp>, SgToLaneVectorDeinterleave,
+      SgToLaneVectorInterleave, SgToLaneDpasMx>(typeConverter,
+                                                patterns.getContext());
 }

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
deleted file mode 100644
index 1b4dddcb4ae55..0000000000000
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ /dev/null
@@ -1,2280 +0,0 @@
-//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
-#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
-#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
-#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
-#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
-#include "mlir/IR/AffineMap.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/TypeRange.h"
-#include "mlir/IR/Value.h"
-#include "mlir/IR/Visitors.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/InliningUtils.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SmallVectorExtras.h"
-
-namespace mlir {
-namespace xegpu {
-#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
-} // namespace xegpu
-} // namespace mlir
-
-#define DEBUG_TYPE "xegpu-subgroup-distribute"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
-using namespace mlir;
-
-static const char *const resolveSIMTTypeMismatch =
-    "resolve_simt_type_mismatch"; // Attribute name for identifying
-                                  // UnrelizedConversionCastOp added to resolve
-                                  // SIMT type mismatches.
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// SIMT Distribution Patterns
-//===----------------------------------------------------------------------===//
-
-/// In certain cases, we may need to favor XeGPU specific distribution patterns
-/// over generic vector distribution patterns. In such cases, we can assign
-/// priorities to patterns.
-enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 };
-
-/// Helper function to resolve types if the distributed type out of
-/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
-/// Example 1:
-///   distributed type: vector<8x1xf32>
-///   expected type: vector<8xf32>
-///   resolved using,
-///   %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
-/// Example 2:
-///   distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
-///   expected type: xegpu.tensor_desc<8x16xf32>
-///   resolved using,
-///   %0 = unrealized_conversion_cast %1 :
-///      xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
-///      xegpu.tensor_desc<8x16xf32>
-template <typename T>
-static Value resolveDistributedTy(Value orig, T expected,
-                                  PatternRewriter &rewriter) {
-  // If orig and expected types are the same, return orig.
-  if (orig.getType() == expected)
-    return orig;
-  // If orig is a vector type, create a shape cast op to reconcile the types.
-  if (isa<VectorType>(orig.getType())) {
-    auto castOp =
-        vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
-    return castOp.getResult();
-  }
-  // If orig is a tensor descriptor type, create an unrealized conversion cast
-  // op to reconcile the types.
-  if (isa<xegpu::TensorDescType>(orig.getType())) {
-    auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
-                                                     expected, orig);
-    castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
-    return castOp.getResult(0);
-  }
-  llvm_unreachable("Unsupported type for reconciliation");
-  return orig;
-}
-
-/// Given a vector type and its distributed vector type, return the list of
-/// dimensions that are distributed.
-static SmallVector<int64_t> getDistributedDims(VectorType originalType,
-                                               VectorType distributedType) {
-  assert(originalType.getRank() == distributedType.getRank() &&
-         "sequential and distributed vector types must have the same rank");
-  SmallVector<int64_t> distributedDims;
-  for (int64_t i = 0; i < originalType.getRank(); ++i) {
-    if (distributedType.getDimSize(i) != originalType.getDimSize(i)) {
-      distributedDims.push_back(i);
-    }
-  }
-  return distributedDims;
-}
-
-/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
-/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
-/// contained within a WarpExecuteOnLane0Op.
-/// Example:
-///
-/// ```
-///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
-///     ...
-///     ...
-///     gpu.return %result: vector<8x16xf32>
-///   }
-/// ```
-/// To
-/// ```
-///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
-///     %laneid = gpu.lane_id : index
-///     %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
-///       ...
-///       ...
-///       gpu.yield %result: vector<8x16xf32>
-///     }
-///     return %0
-///   }
-struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
-  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
-                                PatternRewriter &rewriter) const override {
-    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
-    if (!uArch)
-      return rewriter.notifyMatchFailure(
-          gpuFuncOp, "Subgroup distribution requires target attribute attached "
-                     "to set the warp size");
-    if (!gpuFuncOp.getBody().hasOneBlock())
-      return rewriter.notifyMatchFailure(
-          gpuFuncOp, "expected gpu.func to have a single block");
-
-    // If the function only contains a single void return, skip.
-    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
-          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
-        }))
-      return failure();
-    // If the function already moved inside a warp_execute_on_lane0, skip.
-    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
-          return isa<gpu::WarpExecuteOnLane0Op>(op);
-        }))
-      return failure();
-    gpu::ReturnOp origReturnOp = dyn_cast_if_present<gpu::ReturnOp>(
-        gpuFuncOp.getBlocks().back().getTerminator());
-    if (!origReturnOp)
-      return rewriter.notifyMatchFailure(
-          gpuFuncOp, "expected gpu.func terminator to be gpu.return");
-    // Create a new function with the same signature and same attributes.
-    SmallVector<Type> workgroupAttributionsTypes =
-        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributionBBArgs(),
-                            [](BlockArgument arg) { return arg.getType(); });
-    SmallVector<Type> privateAttributionsTypes =
-        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
-                            [](BlockArgument arg) { return arg.getType(); });
-    auto newGpuFunc = gpu::GPUFuncOp::create(
-        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
-        gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
-        privateAttributionsTypes);
-    newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
-    // Create a WarpExecuteOnLane0Op with same arguments and results as the
-    // original gpuFuncOp.
-    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
-    auto laneId = gpu::LaneIdOp::create(
-        rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
-        /** upperBound = **/ mlir::IntegerAttr());
-    ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
-    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
-        rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
-        uArch->getSubgroupSize(), newGpuFunc.getArguments(),
-        newGpuFunc.getArgumentTypes());
-    Block &warpBodyBlock = warpOp.getBodyRegion().front();
-    // Replace the ReturnOp of the original gpu function with a YieldOp.
-    rewriter.setInsertionPointAfter(origReturnOp);
-    gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
-                         origReturnOp.getOperands());
-    rewriter.eraseOp(origReturnOp);
-    // Move the original function body to the WarpExecuteOnLane0Op body.
-    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
-                                warpOp.getBodyRegion().begin());
-    rewriter.eraseBlock(&warpBodyBlock);
-    // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
-    rewriter.setInsertionPointAfter(warpOp);
-    gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
-    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
-    return success();
-  }
-};
-
-/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
-/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
-/// still contain the original op that will not be used by the yield op (and
-/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
-/// arguments. Tensor descriptor shape is not distributed because it is a
-/// uniform value across all work items within the subgroup. However, the
-/// layout information is dropped in the new tensor descriptor type.
-///
-/// Example:
-///
-/// ```
-///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///                   (!xegpu.tensor_desc<4x8xf32, #layout0>) {
-///     ...
-///     %td = xegpu.create_nd_tdesc %arg0
-///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
-///     vector.yield %td
-///   }
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
-///     ...
-///     %dead = xegpu.create_nd_tdesc %arg0
-///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
-///     vector.yield %arg0, %dead
-///   }
-///   %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
-///                                 -> !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a xegpu::CreateNdDesc op");
-    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-
-    xegpu::DistributeLayoutAttr layout = descOp.getType().getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          descOp, "the tensor descriptor lacks layout attribute");
-    SmallVector<size_t> newRetIndices;
-    rewriter.setInsertionPoint(warpOp);
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
-        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
-
-    SmallVector<Value> newDescOperands = llvm::map_to_vector(
-        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
-    rewriter.setInsertionPointAfter(newWarpOp);
-    xegpu::TensorDescType distributedTensorDescTy =
-        descOp.getType().dropLayouts(); // Distributed tensor descriptor type
-                                        // does not contain layout info.
-    Value newDescOp = xegpu::CreateNdDescOp::create(
-        rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
-        descOp->getAttrs());
-
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    // Resolve the distributed type to the expected type.
-    newDescOp =
-        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
-    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
-    return success();
-  }
-};
-
-/// Distribute a store_nd op at the end of enclosing
-/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
-/// through the warp op interface they would be propagated as returned values.
-/// Source vector is distributed based on lane layout. Appropriate cast ops are
-/// inserted if the distributed types does not match expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   gpu.warp_execute_on_lane_0(%laneid) -> () {
-///     ...
-///     xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
-///                                 !xegpu.tensor_desc<4x8xf32, #layout0>
-///   }
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-///   !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-///     ...
-///     gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
-///     !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
-///   }
-///   %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
-///   %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-///   #layout0>
-///     -> !xegpu.tensor_desc<4x8xf32>
-///   xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
-///     !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    gpu::YieldOp yield = warpOp.getTerminator();
-    Operation *lastNode = yield->getPrevNode();
-    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
-    if (!storeOp)
-      return failure();
-
-    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
-    // Expecting offsets to be present.
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(storeOp,
-                                         "the store op must have offsets");
-    SmallVector<Value> offsetsAsValues =
-        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
-    SmallVector<Type> offsetTypes = llvm::map_to_vector(
-        offsetsAsValues, [](Value v) { return v.getType(); });
-    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
-    xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          storeOp, "the source tensor descriptor lacks layout attribute");
-
-    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
-        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
-    if (failed(distributedTypeByWarpOpOrFailure))
-      return rewriter.notifyMatchFailure(storeOp,
-                                         "Failed to distribute the type");
-    VectorType distributedTypeByWarpOp =
-        distributedTypeByWarpOpOrFailure.value();
-
-    SmallVector<size_t> newRetIndices;
-    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
-                                           storeOp.getTensorDesc()};
-    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
-    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
-    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
-    // Create a new store op outside the warp op with the distributed vector
-    // type. Tensor descriptor is not distributed.
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newStoreOperands;
-
-    // For the value operand, there can be a mismatch between the vector type
-    // distributed by the warp op and (xegpu-specific) distributed type
-    // supported by the store op. Type mismatch must be resolved using
-    // appropriate cast op.
-    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
-        xegpu::getDistributedVectorType(storeOp.getTensorDescType());
-    if (failed(storeNdDistributedValueTyOrFailure))
-      return rewriter.notifyMatchFailure(
-          storeOp, "Failed to get distributed vector type for the store op");
-    newStoreOperands.push_back(resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[0]),
-        storeNdDistributedValueTyOrFailure.value(), rewriter));
-    // For the tensor descriptor operand, the layout attribute is dropped after
-    // distribution. Types needs to be resolved in this case also.
-    xegpu::TensorDescType distributedTensorDescTy =
-        storeOp.getTensorDescType().dropLayouts();
-    newStoreOperands.push_back(
-        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
-                             distributedTensorDescTy, rewriter));
-    // Collect offsets.
-    for (size_t i = 2; i < newRetIndices.size(); ++i)
-      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
-
-    auto newStoreOp =
-        xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
-                                 newStoreOperands, storeOp->getAttrs());
-    xegpu::removeLayoutAttrs(newStoreOp);
-    rewriter.eraseOp(storeOp);
-    return success();
-  }
-};
-
-/// Distribute a load_nd op feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
-/// The warp op will still contain the original op that will not be used by
-/// the yield op (and should be cleaned up later). The yield op will
-/// bypass the load's arguments. Only the loaded vector is distributed
-/// according to lane layout and, tensor descriptor types is not
-/// distributed. Appropriate cast ops are inserted if the distributed types does
-/// not match expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///                   (vector<4x1xf32>) {
-///     ...
-///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
-///     ->
-///       vector<4x8xf32>
-///     gpu.yield %ld
-///   }
-/// ```
-/// To
-/// ```
-///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-///   !xegpu.tensor_desc<4x8xf32, #layout0>) {
-///     ...
-///     %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> ->
-///     vector<4x8xf32> gpu.yield %dead, %arg0
-///   }
-///   %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-///        #layout0> -> !xegpu.tensor_desc<4x8xf32>
-///   %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
-///   %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32>
-///
-/// ```
-/// Distribute a load_nd op feeding into the yield op of the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op. Only a load that
-/// is the last op before the terminator is handled. The tensor descriptor and
-/// the offsets are yielded by the new warp op and a new load_nd producing the
-/// xegpu distributed vector type is created after it. Cast ops are inserted
-/// when the type distributed by the warp op and the xegpu distributed type
-/// differ.
-struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
-      if (!isa<xegpu::LoadNdOp>(op))
-        return false;
-      // Make sure the same load op is the last operation in the warp op body.
-      // This ensures that the load op is not sunk in a way that violates any
-      // barrier synchronization.
-      gpu::YieldOp yield = warpOp.getTerminator();
-      return yield->getPrevNode() == op;
-    });
-
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a xegpu::LoadNd op");
-
-    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
-    // Chip information is required to decide if the layout requires transpose
-    // effect.
-    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
-    if (!uArch)
-      return rewriter.notifyMatchFailure(
-          loadOp, "xegpu::LoadNdOp require target attribute attached to "
-                  "determine transpose "
-                  "requirement");
-    // Expecting offsets to be present.
-    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(loadOp,
-                                         "the load op must have offsets");
-
-    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
-    xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          loadOp, "the source tensor descriptor lacks layout attribute");
-
-    // Every check that can fail is done before any IR is created or modified:
-    // a pattern must leave the IR untouched when it reports a match failure.
-    FailureOr<VectorType> loadNdDistValueTyOrFailure =
-        xegpu::getDistributedVectorType(tensorDescTy);
-    if (failed(loadNdDistValueTyOrFailure))
-      return rewriter.notifyMatchFailure(
-          loadOp, "Failed to get distributed vector type for the load op");
-
-    // From here on the pattern cannot fail. Offsets are turned into SSA values
-    // (this may create new ops for constant offsets).
-    SmallVector<Value> offsetsAsValues =
-        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
-    SmallVector<Type> offsetTypes = llvm::map_to_vector(
-        offsetsAsValues, [](Value v) { return v.getType(); });
-
-    unsigned operandIdx = operand->getOperandNumber();
-    VectorType distributedTypeByWarpOp =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-
-    // Yield the tensor descriptor followed by the offsets from the warp op.
-    SmallVector<size_t> newRetIndices;
-    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
-    SmallVector<Type> newYieldedTypes = {tensorDescTy};
-    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
-    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
-
-    // Create a new load op outside the warp op with the distributed vector
-    // type.
-    rewriter.setInsertionPointAfter(newWarpOp);
-    // Distributed tensor descriptor type does not contain layout info.
-    xegpu::TensorDescType distributedTensorDescTy = tensorDescTy.dropLayouts();
-    SmallVector<Value> newLoadOperands{
-        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
-                             distributedTensorDescTy, rewriter)};
-    // Collect offsets.
-    for (size_t i = 1; i < newRetIndices.size(); ++i)
-      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
-    auto newLoadOp = xegpu::LoadNdOp::create(
-        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
-        newLoadOperands, loadOp->getAttrs());
-    xegpu::removeLayoutAttrs(newLoadOp);
-    // Set the packed attribute if the layout requires it.
-    newLoadOp.setPacked(xegpu::requirePacked(layout));
-    // Set the transpose attribute if the layout requires it.
-    if (xegpu::requireTranspose(layout, uArch))
-      newLoadOp.setTranspose(
-          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    // There can be a conflict between the vector type distributed by the
-    // warp op and (xegpu-specific) distributed type supported by the load
-    // op. Resolve these mismatches by inserting a cast.
-    Value tyResolvedVal = resolveDistributedTy(
-        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
-    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
-    return success();
-  }
-};
-
-/// Distribute a dpas op feeding into the gpu.yield op of the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
-/// The warp op will still contain the original op that will not be used by
-/// the yield op (and should be cleaned up later). The yield op will
-/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
-/// distributed types do not match the expected xegpu SIMT types.
-/// Example:
-/// ```
-///   #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-///   #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
-///   #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
-///                   (vector<8x1xf32>) {
-///     ...
-///     %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
-///       vector<8x16xf32>
-///     gpu.yield %dpas
-///   }
-/// ```
-/// To
-/// ```
-///   %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
-///   vector<8x1xf16>, vector<16x1xf16>) {
-///     ...
-///     %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
-///       -> vector<8x16xf32>
-///     gpu.yield %dead, %arg0, %arg1
-///   }
-///   %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
-///   %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
-///   %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
-///     vector<8xf32>
-///   %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
-/// ```
-struct DpasDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(warpOp,
-                                         "warp result is not a xegpu::Dpas op");
-
-    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-
-    // The layout attributes may be absent; use the null-tolerant cast so that
-    // a missing layout reaches the match failure below.
-    xegpu::LayoutAttr layoutA =
-        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
-    xegpu::LayoutAttr layoutB =
-        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
-    xegpu::LayoutAttr layoutOut =
-        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
-
-    if (!layoutA || !layoutB || !layoutOut)
-      return rewriter.notifyMatchFailure(
-          dpasOp,
-          "the xegpu::Dpas op lacks layout attribute for A, B or output");
-
-    // Types the warp op is expected to yield (lane-layout based distribution).
-    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
-    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
-    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
-
-    if (failed(distLhsTypeByWarpOpOrFailure) ||
-        failed(distRhsTypeByWarpOpOrFailure) ||
-        failed(distResultTypeByWarpOpOrFailure))
-      return rewriter.notifyMatchFailure(
-          dpasOp,
-          "Failed to distribute the A, B or output types in xegpu::Dpas op");
-
-    // Types expected by the distributed (SIMT) dpas op. These are computed
-    // before the warp op is rewritten: a pattern must leave the IR untouched
-    // when it reports a match failure.
-    FailureOr<VectorType> expectedDistLhsTyOrFailure =
-        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
-    FailureOr<VectorType> expectedDistRhsTyOrFailure =
-        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
-    FailureOr<VectorType> expectedDistResultTyOrFailure =
-        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
-
-    if (failed(expectedDistLhsTyOrFailure) ||
-        failed(expectedDistRhsTyOrFailure) ||
-        failed(expectedDistResultTyOrFailure))
-      return rewriter.notifyMatchFailure(
-          dpasOp,
-          "Failed to get distributed vector type for the dpas operands.");
-
-    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
-                                               dpasOp.getRhs()};
-    llvm::SmallVector<Type, 3> newYieldTypes{
-        distLhsTypeByWarpOpOrFailure.value(),
-        distRhsTypeByWarpOpOrFailure.value()};
-    // Dpas acc operand is optional.
-    if (dpasOp.getAcc()) {
-      newYieldValues.push_back(dpasOp.getAcc());
-      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
-    }
-    // Create a new warp op that additionally yields the dpas operands. From
-    // here on the pattern cannot fail.
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
-
-    // Create a new dpas op outside the warp op.
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newDpasOperands;
-    SmallVector<VectorType> newDpasOperandExpectedTypes;
-
-    // Resolve the distributed types with the original types.
-    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
-    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
-    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
-    if (dpasOp.getAcc())
-      newDpasOperandExpectedTypes.push_back(distributedResultTy);
-
-    // newRetIndices and newDpasOperandExpectedTypes are built in the same
-    // order (lhs, rhs, optional acc), so they can be indexed in lockstep.
-    for (unsigned i = 0; i < newRetIndices.size(); i++) {
-      newDpasOperands.push_back(
-          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
-                               newDpasOperandExpectedTypes[i], rewriter));
-    }
-    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
-                                           distributedResultTy, newDpasOperands,
-                                           dpasOp->getAttrs());
-    xegpu::removeLayoutAttrs(newDpasOp);
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    // Resolve the output type.
-    Value typeResolved =
-        resolveDistributedTy(newDpasOp.getResult(),
-                             distResultTypeByWarpOpOrFailure.value(), rewriter);
-    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
-    return success();
-  }
-};
-
-/// Distribute a prefetch_nd op at the end of enclosing
-/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
-/// through the warp op interface they would be propagated as returned values.
-/// Tensor descriptor shape is not distributed because it is a uniform value
-/// across all work items within the subgroup. Appropriate cast ops are inserted
-/// if the distributed types do not match the expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-///   gpu.warp_execute_on_lane_0(%laneid) -> () {
-///     ...
-///     xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
-///   }
-/// ```
-/// To
-/// ```
-///   %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
-///    !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-///     gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
-///     index
-///   }
-///   %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
-///     #layout0> -> !xegpu.tensor_desc<4x8xf32>
-///   xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Only a prefetch that is the last op before the terminator is handled.
-    gpu::YieldOp yield = warpOp.getTerminator();
-    Operation *lastNode = yield->getPrevNode();
-    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
-    if (!prefetchOp)
-      return failure();
-
-    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
-    // PrefetchNdOp must have offsets.
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(prefetchOp,
-                                         "the prefetch op must have offsets");
-
-    // The layout is only needed as a precondition. It is checked before any IR
-    // is created: a pattern must leave the IR untouched when it reports a
-    // match failure.
-    xegpu::DistributeLayoutAttr layout =
-        prefetchOp.getTensorDescType().getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          prefetchOp, "the source tensor descriptor lacks layout attribute");
-
-    // From here on the pattern cannot fail. Offsets are turned into SSA values
-    // (this may create new ops for constant offsets).
-    SmallVector<Value> offsetsAsValues =
-        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
-    SmallVector<Type> offsetTypes = llvm::map_to_vector(
-        offsetsAsValues, [](Value v) { return v.getType(); });
-
-    // Yield the tensor descriptor followed by the offsets from the warp op.
-    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
-    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
-    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
-    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
-    // Create a new prefetch op outside the warp op with updated tensor
-    // descriptor type. Source tensor descriptor require type resolution.
-    xegpu::TensorDescType newTensorDescTy =
-        prefetchOp.getTensorDescType().dropLayouts();
-    rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
-    // Collect offsets.
-    for (size_t i = 1; i < newRetIndices.size(); ++i)
-      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
-    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
-        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
-        prefetchOp->getAttrs());
-    xegpu::removeLayoutAttrs(newPrefetchOp);
-    // The original prefetch inside the warp op is no longer needed.
-    rewriter.eraseOp(prefetchOp);
-    return success();
-  }
-};
-
-/// Hoist a gpu::BarrierOp that sits right before the terminator of the
-/// enclosing `gpu.warp_execute_on_lane_0` region out of the warp op: an
-/// equivalent barrier is created immediately after the warp op and the
-/// original one is erased.
-struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Match only when the op preceding the terminator is a barrier
-    // (getPrevNode() may be null, hence the null-tolerant cast).
-    auto trailingBarrier = dyn_cast_or_null<gpu::BarrierOp>(
-        warpOp.getTerminator()->getPrevNode());
-    if (!trailingBarrier)
-      return failure();
-
-    // Re-create the barrier after the warp op with the same result types,
-    // operands and attributes, then drop the one inside the region.
-    rewriter.setInsertionPointAfter(warpOp);
-    Location barrierLoc = trailingBarrier.getLoc();
-    gpu::BarrierOp::create(rewriter, barrierLoc,
-                           trailingBarrier->getResultTypes(),
-                           trailingBarrier->getOperands(),
-                           trailingBarrier->getAttrs());
-    rewriter.eraseOp(trailingBarrier);
-    return success();
-  }
-};
-
-/// Distribute a scattered store op. The offsets argument is required.
-/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
-/// The layouts are fixed and implicit: one offset/mask per lane.
-/// The pass changes the offset/mask vector shapes to a
-/// single-element vector, **it is assumed that their producer will also be
-/// distributed**. The payload vector also has a fixed distribution:
-///   no chunk size -> vector of one element.
-///   chunk size    -> vector of the innermost dimension of the SG-payload.
-/// Example 1 (no chunk size):
-///    %mask = producer_op : vector<16xi1>
-///    %offset = producer_op : vector<16xindex>
-///    xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
-///     memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-///    %mask = producer_op : vector<1xi1>
-///    %offset = producer_op : vector<1xindex>
-///    xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
-///     memref<256xf16>, vector<1xindex>, vector<1xi1>
-/// Example 2 (chunk size, same mask and offsets):
-///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-///     vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-///     vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-///
-/// Note that the store distribution pattern also handles leading unit
-/// dimensions in the payload, mask and offsets vectors. In this case the store
-/// distribution will only change the dimensions corresponding to the SG
-/// distribution and keep the leading unit dimensions unchanged.
-/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
-/// will be distributed as vector<1x1xf16>. Shape cast ops are inserted for the
-/// offset/mask/payload when necessary so that the distributed store is working
-/// on a 1D vector shape to match the HW capability.
-struct StoreDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Only a store that is the last op before the terminator is handled.
-    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
-    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
-    if (!storeScatterOp)
-      return failure();
-    Value offsets = storeScatterOp.getOffsets();
-    if (!isa<VectorType>(offsets.getType()))
-      return rewriter.notifyMatchFailure(
-          storeScatterOp, "Store op must have a vector of offsets argument");
-    // The mask is cast to a vector type below; reject non-vector masks here
-    // instead of asserting in the cast (mirrors the scattered load pattern).
-    if (!isa<VectorType>(storeScatterOp.getMask().getType()))
-      return rewriter.notifyMatchFailure(
-          storeScatterOp, "Store op must have a vector mask argument");
-    VectorType offsetsTy = cast<VectorType>(offsets.getType());
-    VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
-    VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
-
-    // Add handling for leading unit dimensions support
-    int chunkSize = storeScatterOp.getChunkSize().value_or(1);
-    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
-
-    // Check that all leading dimensions are unit dimensions
-    for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
-      if (storeVecTy.getShape()[i] != 1) {
-        return rewriter.notifyMatchFailure(
-            storeScatterOp, "Only unit dimensions allowed for the leading "
-                            "dimensions of the store vector!");
-      }
-    }
-
-    // The offsets and mask share one layout inferred from the payload layout.
-    auto layoutPayload = storeScatterOp.getLayoutAttr();
-    auto layoutOffsets =
-        xegpu::inferMaskOffsetLayoutForScatterIO(layoutPayload, chunkSize);
-    auto layoutMask = layoutOffsets;
-
-    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
-    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
-    FailureOr<VectorType> distMaskByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
-    // NOTE(review): the message below mentions "using defaults", but the
-    // pattern bails out here; no defaults are applied in this block.
-    if (failed(distStoreVecByWarpOpOrFailure) ||
-        failed(distOffsetsByWarpOpOrFailure) ||
-        failed(distMaskByWarpOpOrFailure)) {
-      return rewriter.notifyMatchFailure(
-          storeScatterOp,
-          "Some vector operands have no layouts, using defaults instead.");
-    }
-
-    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
-    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
-    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-
-    // Yield all store operands; the type at index 1 is kept as is, the other
-    // three are yielded with their distributed vector types.
-    SmallVector<size_t> newRetIndices;
-    SmallVector<Value> operands = storeScatterOp->getOperands();
-    SmallVector<Type> operandTypesToYield = {
-        distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};
-
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    // Distributed store payload type is always 1D without leading unit dims
-    VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
-                                             distPayloadTy.getElementType());
-
-    VectorType distOffsetsTy1D = VectorType::get(
-        {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
-    VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
-                                              distMaskTy.getElementType());
-
-    // Resolve distributed types to 1D for SIMT execution
-    Value distPayloadVal = resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
-    Value distOffsetVal = resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
-    Value distMaskVal = resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);
-
-    SmallVector<Value> newStoreScatterOpOperands = {
-        distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
-        distMaskVal};
-
-    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
-        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
-        storeScatterOp->getAttrs());
-    xegpu::removeLayoutAttrs(newOp);
-    rewriter.eraseOp(storeScatterOp);
-    return success();
-  }
-};
-
-/// Compute the coordinates a lane uses for a matrix load/store: the
-/// distributed coordinates that `layout` yields for `laneId` and
-/// `payloadShape` are combined with `origOffsets` through
-/// xegpu::addWithRightAligned. Returns an empty vector when the layout cannot
-/// compute distributed coordinates. Exactly one set of distributed
-/// coordinates is expected (asserted).
-static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
-    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
-    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
-  auto laneCoordsOrFailure =
-      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
-  if (failed(laneCoordsOrFailure))
-    return {};
-  assert(laneCoordsOrFailure->size() == 1 &&
-         "Expected one set of distributed offsets");
-  // Add the per-lane coordinates to the original offsets.
-  SmallVector<OpFoldResult> combinedOffsets = xegpu::addWithRightAligned(
-      rewriter, loc, getAsOpFoldResult(laneCoordsOrFailure->front()),
-      getAsOpFoldResult(origOffsets));
-  return llvm::map_to_vector(combinedOffsets, llvm::CastTo<Value>);
-}
-
-/// Pattern for distributing xegpu::LoadMatrixOp.
-/// The load must be the last op before the terminator of the enclosing
-/// `gpu.warp_execute_on_lane_0` and its result must be yielded by the warp op.
-/// The memory descriptor and the offsets are yielded by the new warp op and a
-/// new load_matrix producing the distributed payload type is created after
-/// it. Unless the subgroup block IO attribute is set, the offsets are
-/// replaced by per-lane coordinates computed from the layout.
-struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    gpu::YieldOp yield = warpOp.getTerminator();
-    Operation *lastNode = yield->getPrevNode();
-    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
-    if (!matrixOp)
-      return failure();
-
-    // The yielded value must be produced by exactly this (last) load op.
-    OpOperand *producedByLastLoad = getWarpResult(
-        warpOp, [&](Operation *op) { return op == matrixOp.getOperation(); });
-    if (!producedByLastLoad)
-      return rewriter.notifyMatchFailure(
-          warpOp, "The last op is not xegpu::LoadMatrixOp");
-    const int operandIdx = producedByLastLoad->getOperandNumber();
-
-    VectorType sgPayloadTy =
-        dyn_cast<VectorType>(matrixOp.getResult().getType());
-    VectorType warpResultTy =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    if (!sgPayloadTy)
-      return rewriter.notifyMatchFailure(
-          matrixOp, "the matrix op payload must be a vector type");
-
-    auto loc = matrixOp.getLoc();
-    auto offsets = matrixOp.getMixedOffsets();
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(matrixOp,
-                                         "the load op must have offsets");
-
-    // Layout and distributed-type checks run before any IR is created: a
-    // pattern must leave the IR untouched when it reports a match failure.
-    auto layout = matrixOp.getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          matrixOp, "the matrix operation lacks layout attribute");
-
-    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
-    if (failed(distPayloadByWarpOpOrFailure))
-      return rewriter.notifyMatchFailure(
-          matrixOp, "Failed to distribute matrix op payload based on layout.");
-
-    // From here on the pattern does not report failure. Offsets are turned
-    // into SSA values (this may create new ops for constant offsets).
-    SmallVector<Value> offsetsAsValues =
-        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
-
-    // Yield the memory descriptor followed by the offsets.
-    SmallVector<Value> operands = {matrixOp.getMemDesc()};
-    const unsigned offsetsStartIdx = operands.size();
-    operands.append(offsetsAsValues);
-
-    SmallVector<Type> operandTypes =
-        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
-
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, operands, operandTypes, newRetIndices);
-    SmallVector<Value> newOperands = llvm::map_to_vector(
-        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
-    // All offsets are passed as SSA values, so every constant offset slot is
-    // marked dynamic.
-    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
-                                         ShapedType::kDynamic);
-    DenseI64ArrayAttr newConstOffsetsAttr =
-        rewriter.getDenseI64ArrayAttr(newConstOffsets);
-    ValueRange currentOffsets =
-        ValueRange(newOperands).drop_front(offsetsStartIdx);
-
-    SmallVector<Value> newCoords = currentOffsets;
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
-    // vector on failure and that case is not checked here.
-    if (!matrixOp.getSubgroupBlockIoAttr()) {
-      newCoords = computeDistributedCoordinatesForMatrixOp(
-          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
-          currentOffsets);
-    }
-    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
-        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
-        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
-        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
-    // Resolve the output type and replace all uses.
-    rewriter.replaceAllUsesWith(
-        newWarpOp.getResult(operandIdx),
-        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
-    return success();
-  }
-};
-
-/// Pattern for distributing xegpu::StoreMatrixOp.
-/// The store must be the last op before the terminator of the enclosing
-/// `gpu.warp_execute_on_lane_0`. The data (with its distributed type), the
-/// memory descriptor and the offsets are yielded by the new warp op and a new
-/// store_matrix is created after it. Unless the subgroup block IO attribute is
-/// set, the offsets are replaced by per-lane coordinates computed from the
-/// layout.
-struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    gpu::YieldOp yield = warpOp.getTerminator();
-    Operation *lastNode = yield->getPrevNode();
-    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
-    if (!matrixOp)
-      return failure();
-
-    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
-    if (!sgPayloadTy)
-      return rewriter.notifyMatchFailure(
-          matrixOp, "the matrix op payload must be a vector type");
-
-    auto loc = matrixOp.getLoc();
-    auto offsets = matrixOp.getMixedOffsets();
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(matrixOp,
-                                         "the store op must have offsets");
-
-    // Layout and distributed-type checks run before any IR is created: a
-    // pattern must leave the IR untouched when it reports a match failure.
-    auto layout = matrixOp.getLayoutAttr();
-    if (!layout)
-      return rewriter.notifyMatchFailure(
-          matrixOp, "the matrix operation lacks layout attribute");
-
-    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
-    if (failed(distPayloadByWarpOpOrFailure))
-      return rewriter.notifyMatchFailure(
-          matrixOp, "Failed to distribute matrix op payload based on layout.");
-
-    // From here on the pattern does not report failure. Offsets are turned
-    // into SSA values (this may create new ops for constant offsets).
-    SmallVector<Value> offsetsAsValues =
-        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
-
-    // Yield the data, the memory descriptor and then the offsets.
-    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
-    const unsigned offsetsStartIdx = operands.size();
-    operands.append(offsetsAsValues);
-
-    // Only the data operand is yielded with a distributed type.
-    SmallVector<Type> operandTypes =
-        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
-    operandTypes[0] = *distPayloadByWarpOpOrFailure;
-
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, operands, operandTypes, newRetIndices);
-    SmallVector<Value> newOperands = llvm::map_to_vector(
-        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
-    // All offsets are passed as SSA values, so every constant offset slot is
-    // marked dynamic.
-    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
-                                         ShapedType::kDynamic);
-    DenseI64ArrayAttr newConstOffsetsAttr =
-        rewriter.getDenseI64ArrayAttr(newConstOffsets);
-    ValueRange currentOffsets =
-        ValueRange(newOperands).drop_front(offsetsStartIdx);
-
-    SmallVector<Value> newCoords = currentOffsets;
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    // NOTE(review): computeDistributedCoordinatesForMatrixOp returns an empty
-    // vector on failure and that case is not checked here.
-    if (!matrixOp.getSubgroupBlockIoAttr()) {
-      newCoords = computeDistributedCoordinatesForMatrixOp(
-          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
-          currentOffsets);
-    }
-
-    xegpu::StoreMatrixOp::create(
-        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
-        ValueRange(newCoords), newConstOffsetsAttr,
-        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
-    // The original store inside the warp op is no longer needed.
-    rewriter.eraseOp(matrixOp);
-    return success();
-  }
-};
-
-/// Distribute a scattered load op. The logic and requirements are the same as
-/// for the scattered store distribution. The warpOp's payload vector is
-/// expected to be distributed by the load's result consumer.
-/// Example 1 (no chunk size):
-///    %mask = producer_op : vector<16xi1>
-///    %offset = producer_op : vector<16xindex>
-///    %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
-///    vector<16xindex>, vector<16xi1> -> vector<16xf16>
-/// To
-///    %mask = producer_op : vector<1xi1>
-///    %offset = producer_op : vector<1xindex>
-///    %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
-///     vector<1xindex>, vector<1xi1> -> vector<1xf16>
-/// Example 2 (chunk size, same mask and offsets):
-///    %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
-///     memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-/// To
-///    %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
-///     memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-///
-/// Note that the load distribution pattern also handles leading unit dimensions
-/// in the payload, mask, and offsets vectors. The load distribution will only
-/// change the dimensions corresponding to the SG distribution and keep the
-/// leading unit dimensions unchanged. For example, a load with result type
-/// vector<1x16xf16> with lane layout [1, 16] will be distributed
-/// as result type vector<1x1xf16>. Shape cast ops are inserted for the
-/// offset/mask/payload when necessary so that the distributed load is working
-/// on a 1D vector shape to match the HW capability.
-struct LoadDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
-      // Check if the yield operand that was produced by the *last* scattered
-      // load op to avoid sinking it before barriers (maintain memory order).
-      return isa<xegpu::LoadGatherOp>(op) &&
-             warpOp.getTerminator()->getPrevNode() == op;
-    });
-    if (!producedByLastLoad)
-      return rewriter.notifyMatchFailure(
-          warpOp, "The last op is not xegpu::LoadGatherOp");
-
-    auto loadGatherOp =
-        producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
-    Value offsets = loadGatherOp.getOffsets();
-    if (!isa<VectorType>(offsets.getType()) ||
-        !isa<VectorType>(loadGatherOp.getMask().getType()))
-      return rewriter.notifyMatchFailure(
-          loadGatherOp,
-          "Load op must have vector arguments for offsets and mask");
-    VectorType offsetsTy = cast<VectorType>(offsets.getType());
-    VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
-    VectorType resultVecTy =
-        cast<VectorType>(loadGatherOp.getResult().getType());
-    // add handling leading unit dimensions support
-    int chunkSize = loadGatherOp.getChunkSize().value_or(1);
-    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
-    for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
-      if (resultVecTy.getShape()[i] != 1) {
-        return rewriter.notifyMatchFailure(
-            loadGatherOp, "Only unit dimensions allowed for the leading "
-                          "dimensions of the load vector!");
-      }
-    }
-
-    auto layoutPayload = loadGatherOp.getLayoutAttr();
-    auto layoutOffsets =
-        xegpu::inferMaskOffsetLayoutForScatterIO(layoutPayload, chunkSize);
-    auto layoutMask = layoutOffsets;
-
-    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
-    FailureOr<VectorType> distMaskByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
-    if (failed(distOffsetsByWarpOpOrFailure) ||
-        failed(distMaskByWarpOpOrFailure)) {
-      return rewriter.notifyMatchFailure(
-          loadGatherOp,
-          "Some vector operands have no layouts, using defaults instead.");
-    }
-
-    SmallVector<size_t> newRetIndices;
-    SmallVector<Value> operands = loadGatherOp->getOperands();
-
-    const unsigned operandIdx = producedByLastLoad->getOperandNumber();
-    VectorType distResultTy =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
-    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-
-    SmallVector<Type> operandTypesToYield = {operands[0].getType(),
-                                             distOffsetsTy, distMaskTy};
-
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    // Distributed load op will always be 1D.
-    VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
-                                             distResultTy.getElementType());
-
-    VectorType distOffsetsTy1D =
-        VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
-                        distOffsetsByWarpOpOrFailure.value().getElementType());
-    VectorType distMaskTy1D =
-        VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
-                        distMaskByWarpOpOrFailure.value().getElementType());
-
-    Value distOffsetVal = resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
-    Value distmaskVal = resolveDistributedTy(
-        newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
-
-    SmallVector<Value> newLoadGatherOperands = {
-        newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
-
-    xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
-        rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
-        loadGatherOp->getAttrs());
-    xegpu::removeLayoutAttrs(newOp);
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    // Resolve the output type and replace all uses.
-    rewriter.replaceAllUsesWith(
-        distributedVal,
-        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
-    return success();
-  }
-};
-
-// Sinks subgroup-uniform ops out of the warp region. An op counts as uniform
-// when neither its operands nor its results carry a distribution layout
-// attribute; vectors that do carry a layout are left to dedicated patterns.
-// This pattern must be given a higher priority than the vector dialect
-// distribution patterns: a shape that could be distributed may still be
-// logically uniform (i.e., have no layout), and must then stay undistributed.
-struct SinkUniformOps final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Only the op right before the terminator is a candidate.
-    Operation *lastOp = warpOp.getTerminator()->getPrevNode();
-    if (!lastOp)
-      return failure();
-    // Ops that own regions need careful handling in dedicated patterns.
-    if (lastOp->getNumRegions())
-      return failure();
-
-    // If the candidate produces results, it must feed the yield, and the warp
-    // result type must be identical to the type of the candidate's first
-    // result (i.e. the value is not distributed).
-    int yieldIdx = -1;
-    if (lastOp->getNumResults()) {
-      OpOperand *yielded = getWarpResult(
-          warpOp, [&](Operation *candidate) { return candidate == lastOp; });
-      if (!yielded)
-        return failure();
-      yieldIdx = yielded->getOperandNumber();
-      Type insideTy = lastOp->getResult(0).getType();
-      Type outsideTy = warpOp.getResult(yieldIdx).getType();
-      if (insideTy != outsideTy)
-        return rewriter.notifyMatchFailure(warpOp,
-                                           "The op result is not uniform.");
-    }
-
-    // Neither the results nor the operands may carry a layout.
-    const bool resultsAreUniform =
-        llvm::all_of(lastOp->getResults(), [](Value v) {
-          return !xegpu::getDistributeLayoutAttr(v);
-        });
-    const bool operandsAreUniform =
-        llvm::all_of(lastOp->getOpOperands(), [](OpOperand &opr) {
-          return !xegpu::getDistributeLayoutAttr(opr);
-        });
-    if (!(resultsAreUniform && operandsAreUniform))
-      return rewriter.notifyMatchFailure(warpOp,
-                                         "Some values are not uniform.");
-
-    // Yield every operand of the candidate, with its type unchanged, from a
-    // new warp op.
-    SmallVector<Value> yieldedValues =
-        llvm::to_vector_of<Value>(lastOp->getOperands());
-    SmallVector<Type> yieldedTypes =
-        llvm::to_vector_of<Type>(lastOp->getOperandTypes());
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, yieldedValues, yieldedTypes, newRetIndices);
-
-    // Clone the candidate after the new warp op. The i-th operand is remapped
-    // to the new warp op result whose index equals i.
-    rewriter.setInsertionPointAfter(newWarpOp);
-    IRMapping operandMapper;
-    for (size_t i = 0, e = newRetIndices.size(); i != e; ++i)
-      operandMapper.map(lastOp->getOperand(i), newWarpOp->getResult(i));
-    Operation *clonedOp = rewriter.clone(*lastOp, operandMapper);
-
-    if (clonedOp->getNumResults()) {
-      assert(yieldIdx != -1 && "Expected a warp result for the operation");
-      rewriter.replaceAllUsesWith(newWarpOp.getResult(yieldIdx),
-                                  clonedOp->getResult(0));
-    } else {
-      // Without results nothing references the original op, so drop it.
-      rewriter.eraseOp(lastOp);
-    }
-    return success();
-  }
-};
-
-/// This pattern distributes the `vector.multi_reduction` operation across
-/// lanes in a warp. The source must be at least 2D, every dimension other than
-/// the last two must be a unit dimension, and exactly one of the last two
-/// dimensions is reduced. Given the layout of the source vector,
-/// * If the reduction dimension is distributed across lanes, the reduction is
-///   non-lane-local and the reduction is done using warp shuffles. Here we
-///   simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
-///   the warp op body.
-/// * If the reduction dimension is not distributed across lanes, the reduction
-///   is lane-local. In this case, we yield the source and accumulator vectors
-///   from the warp op and perform the lane-local reduction outside the warp op
-///   using a sequence of ReductionOps.
-/// Example 1 (Reduction is lane-local):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
-///   %0 = "some_def"() : () -> (vector<16x32xf32>)
-///   %acc = "some_def"() : () -> (vector<32xf32>)
-///   %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
-///   vector<32xf32>
-///   gpu.yield %1 : vector<32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
-/// vector<1xf32>) {
-///   %0 = "some_def"() : () -> (vector<16x32xf32>)
-///   %acc = "some_def"() : () -> (vector<32xf32>)
-///   gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
-/// }
-/// %c = arith.constant dense<0.0> : vector<1xf32>
-/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
-/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
-/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
-/// ```
-/// Example 2 (Reduction is non-lane-local):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
-///   %0 = "some_def"() : () -> (vector<2x32xf32>)
-///   %acc = "some_def"() : () -> (vector<2xf32>)
-///   %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
-///   vector<2xf32>
-///   gpu.yield %1 : vector<2xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
-///   %0 = "some_def"() : () -> (vector<2x32xf32>)
-///   %acc = "some_def"() : () -> (vector<2xf32>)
-///   %1 = arith.constant dense<0.0> : vector<2xf32>
-///   %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>>
-///   %3 = ("warp.reduction %2") : f32
-///   %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
-///   ... repeat for row 1, producing %N from the last vector.insert
-///   gpu.yield %N : vector<2xf32>
-/// }
-/// ```
-struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Match a multi_reduction whose result is yielded by the warp op.
-    OpOperand *yieldOperand =
-        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
-    if (!yieldOperand)
-      return failure();
-    auto reductionOp =
-        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
-    unsigned operandIdx = yieldOperand->getOperandNumber();
-    VectorType sourceType = reductionOp.getSourceVectorType();
-    int64_t sourceRank = sourceType.getRank();
-    // Need at least a 2D source vector.
-    if (sourceRank < 2)
-      return rewriter.notifyMatchFailure(warpOp,
-                                         "Only 2D+ reductions are supported.");
-    // Leading dimensions (first rank-2) must be unit (size 1).
-    for (int64_t i = 0; i < sourceRank - 2; ++i) {
-      if (sourceType.getShape()[i] != 1)
-        return rewriter.notifyMatchFailure(
-            warpOp, "Only unit dimensions allowed for the leading dimensions.");
-    }
-    // Effective dimension indices (last 2 dims of the source).
-    int64_t rowIdx = sourceRank - 2;
-    int64_t columnIdx = sourceRank - 1;
-    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
-    if (reductionDims.size() != 1)
-      return rewriter.notifyMatchFailure(warpOp,
-                                         "Only 1 reduction dim is supported.");
-    int64_t reductionDim = reductionDims[0];
-    // The reduction dim must be among the last 2 dims.
-    if (reductionDim != rowIdx && reductionDim != columnIdx)
-      return rewriter.notifyMatchFailure(
-          warpOp, "Reduction dim must be among the last 2 dimensions.");
-    // The distributed result type is dictated by the existing warp op result.
-    VectorType distributedResultType =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    VectorType resultType = cast<VectorType>(reductionOp.getType());
-    xegpu::DistributeLayoutAttr sourceLayout =
-        xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
-
-    FailureOr<VectorType> sourceDistTypeOrFailure =
-        getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
-    if (failed(sourceDistTypeOrFailure))
-      return rewriter.notifyMatchFailure(
-          warpOp, "Failed to distribute the source vector type.");
-    VectorType sourceDistType = sourceDistTypeOrFailure.value();
-    // Only single dimension distribution among the last 2 dims is supported.
-    // A dim counts as distributed when its distributed size differs from its
-    // original size.
-    bool rowDistributed =
-        sourceDistType.getShape()[rowIdx] != sourceType.getShape()[rowIdx];
-    bool columnDistributed = sourceDistType.getShape()[columnIdx] !=
-                             sourceType.getShape()[columnIdx];
-    if (rowDistributed && columnDistributed)
-      return rewriter.notifyMatchFailure(
-          warpOp, "Expecting source to be distributed in a single dimension.");
-    int64_t sourceDistDim =
-        rowDistributed ? rowIdx : (columnDistributed ? columnIdx : -1);
-    if (sourceDistDim == -1)
-      return rewriter.notifyMatchFailure(
-          warpOp, "Expecting a distributed source vector.");
-    bool resultDistributed =
-        distributedResultType.getNumElements() < resultType.getNumElements();
-    // If the lane owns all the data required for reduction (i.e. reduction is
-    // fully parallel across lanes), then each lane owns part of the result
-    // (i.e. result is distributed). If the reduction requires cross-lane
-    // shuffling, then the result is shared among all lanes (broadcasted).
-    // Therefore we expect the following cases, where dim-0/dim-1 denote the
-    // row/column dims (the last two dims of the source):
-    //
-    // | Source vector        | Reduction dim  | Result vector  |
-    // |----------------------|----------------|----------------|
-    // |  dim-0 distributed   |       0        | broadcasted    |
-    // |  dim-0 distributed   |       1        | distributed    |
-    // |  dim-1 distributed   |       0        | distributed    |
-    // |  dim-1 distributed   |       1        | broadcasted    |
-
-    bool isReductionLaneLocal =
-        (sourceDistDim == rowIdx && reductionDim == columnIdx) ||
-        (sourceDistDim == columnIdx && reductionDim == rowIdx);
-    if (isReductionLaneLocal && !resultDistributed)
-      return rewriter.notifyMatchFailure(
-          warpOp, "Expecting a distributed result for lane-local reduction.");
-
-    if (!isReductionLaneLocal && resultDistributed)
-      return rewriter.notifyMatchFailure(
-          warpOp,
-          "Expecting a broadcasted result for non-lane-local reduction.");
-
-    // Handle lane-local reduction case. In this case we fully distribute the
-    // reduction result.
-    if (isReductionLaneLocal) {
-      // Yield the source and acc vectors from the WarpOp.
-      SmallVector<size_t> newRetIndices;
-      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
-          {sourceDistType, distributedResultType}, newRetIndices);
-      rewriter.setInsertionPointAfter(newWarpOp);
-      // Perform the reduction on the distributed values outside the warp op.
-      Value result = xegpu::lowerToVectorReductions(
-          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
-          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
-          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
-      // Replace the warp op result with the final result.
-      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
-      return success();
-    }
-    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
-    // of multiple ReductionOps. Actual distribution is done by the
-    // WarpOpReduction pattern.
-    rewriter.setInsertionPointAfter(reductionOp);
-    Value result = xegpu::lowerToVectorReductions(
-        cast<TypedValue<VectorType>>(reductionOp.getSource()),
-        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
-        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
-    // Replace all uses of the original reduction (still inside the warp op)
-    // with the lowered result.
-    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
-    return success();
-  }
-};
-
-/// This pattern distributes the `vector.broadcast` operation across lanes in a
-/// warp. The pattern supports three use cases:
-///
-/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
-///    vector must have a slice layout of the result. If the distributed source
-///    and target vector types are identical, this lowers to a no-op; otherwise,
-///    it remains a broadcast but operates on distributed vectors.
-///
-/// 2) Broadcast a same-rank vector with identical layouts for source and
-///    target: The source vector must have unit dimensions, and lane_data must
-///    be unit size for those unit dims. This always lowers to a no-op.
-///
-/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
-///    scalar to distributed result type.
-///
-/// A vector source without a layout is rejected with a match failure.
-///
-/// Example 1 (lowering to a broadcast with distributed types):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
-///   %0 = "some_def"() {layout_result_0 =
-///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-///     dims = [0]> } : () -> (vector<32xf32>)
-///   %2 = vector.broadcast %0 {layout_result_0 =
-///     #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
-///     : vector<32xf32> to vector<8x32xf32>
-///   gpu.yield %2 : vector<8x32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
-///   %0 = "some_def"() {layout_result_0 =
-///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-///     dims = [0]> } : () -> (vector<32xf32>)
-///   gpu.yield %0 : vector<32xf32>
-/// }
-/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
-/// ```
-///
-/// Example 2 (no-op):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
-///   %0 = "some_def"() {layout_result_0 =
-///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-///     dims = [1]> } : () -> (vector<8xf32>)
-///   %1 = vector.shape_cast %0
-///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-///      1]>}: vector<8xf32> to vector<8x1xf32>
-///   %2 = vector.broadcast %1
-///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-///     1]>}: vector<8x1xf32> to vector<8x32xf32>
-///   gpu.yield %2 : vector<8x32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
-///   %0 = "some_def"() {layout_result_0 =
-///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-///     dims = [1]> } : () -> (vector<8xf32>)
-///   %1 = vector.shape_cast %0
-///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-///     1]>}: vector<8xf32> to vector<8x1xf32>
-///   gpu.yield %1 : vector<8x1xf32>
-/// }
-/// // The broadcast is implicit through layout transformation (no-op)
-///  "some_use"(%r#0)
-/// ```
-struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    // Match a broadcast whose result is yielded by the warp op.
-    OpOperand *yieldOperand =
-        getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
-    if (!yieldOperand)
-      return failure();
-    auto broadcastOp =
-        cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
-    unsigned operandIdx = yieldOperand->getOperandNumber();
-
-    // `sourceType` is null when the broadcast source is a scalar (case 3).
-    VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
-    VectorType destType =
-        dyn_cast<VectorType>(broadcastOp.getResult().getType());
-
-    xegpu::DistributeLayoutAttr sourceLayout =
-        xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
-    xegpu::DistributeLayoutAttr resultLayout =
-        xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));
-
-    FailureOr<VectorType> sourceDistType;
-    Type sourceElemOrDistType;
-    if (sourceType) {
-      // Case 1 and 2: source is a vector type. Both cases call methods on the
-      // source layout, so bail out here instead of dereferencing a null
-      // attribute when the layout is missing.
-      if (!sourceLayout)
-        return rewriter.notifyMatchFailure(
-            warpOp, "Broadcast from vector must have a source layout.");
-
-      int64_t rankDiff = destType.getRank() - sourceType.getRank();
-      if (rankDiff > 0) {
-        // Case 1: source is lower-rank than result. A layout mismatch is only
-        // reported as a warning; the distribution still proceeds.
-        if (!sourceLayout.isSliceOf(resultLayout))
-          broadcastOp.emitWarning()
-              << "Broadcast input layout must be a slice of result layout.";
-      }
-      // Case 2: source and result have same rank.
-      if (rankDiff == 0) {
-        auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
-        SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
-                                               broadcastUnitDimsSet.end());
-        assert(sourceLayout.isEqualTo(
-                   sourceLayout.setUnitDimData(broadcastUnitDims)) &&
-               "The sg_data for unit dimensions should be set as 1");
-        sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
-      }
-
-      sourceDistType =
-          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
-      if (failed(sourceDistType)) {
-        return rewriter.notifyMatchFailure(
-            warpOp, "Failed to distribute the source vector type.");
-      }
-      sourceElemOrDistType = sourceDistType.value();
-
-    } else {
-      // Case 3: source is a scalar type.
-      if (sourceLayout) {
-        return rewriter.notifyMatchFailure(
-            warpOp, "Broadcast from scalar must not have a layout attribute.");
-      }
-      sourceElemOrDistType = broadcastOp.getSourceType();
-    }
-    FailureOr<VectorType> destDistType =
-        getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
-    if (failed(destDistType)) {
-      return rewriter.notifyMatchFailure(
-          warpOp, "Failed to distribute the dest vector type.");
-    }
-
-    // Yield the broadcast source (distributed, or the scalar as-is) from a new
-    // warp op.
-    SmallVector<size_t> newRetIndices;
-    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
-        newRetIndices);
-
-    Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
-
-    // When the distributed source and result types already agree, the
-    // broadcast is a no-op and the yielded source is used directly.
-    Value newBroadcast = distributedSource;
-
-    if (sourceElemOrDistType != destDistType.value()) {
-      rewriter.setInsertionPointAfter(newWarpOp);
-      newBroadcast =
-          vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
-                                      destDistType.value(), distributedSource);
-    }
-
-    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
-    return success();
-  }
-};
-
-/// Distributes a `vector.shape_cast` whose result is yielded by the enclosing
-/// `gpu.warp_execute_on_lane_0` region. The cast source is yielded from a new
-/// warp op with its distributed type, and the shape_cast is re-created after
-/// that warp op, producing the distributed result type.
-struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *yielded =
-        getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
-    if (!yielded)
-      return failure();
-    auto castOp = cast<vector::ShapeCastOp>(yielded->get().getDefiningOp());
-    unsigned yieldIdx = yielded->getOperandNumber();
-    // The distributed result type is taken from the existing warp op result.
-    auto distResultTy = cast<VectorType>(warpOp.getResult(yieldIdx).getType());
-
-    // Both the source and the result must carry a distribution layout.
-    xegpu::DistributeLayoutAttr srcLayout =
-        xegpu::getTemporaryLayout(castOp->getOpOperand(0));
-    xegpu::DistributeLayoutAttr dstLayout =
-        xegpu::getTemporaryLayout(dyn_cast<OpResult>(castOp.getResult()));
-    if (!(srcLayout && dstLayout))
-      return rewriter.notifyMatchFailure(
-          warpOp,
-          "the source or result of shape_cast op lacks distribution layout");
-
-    FailureOr<VectorType> distSrcTy = getDistVecTypeBasedOnLaneLayout(
-        srcLayout, castOp.getSourceVectorType());
-    if (failed(distSrcTy))
-      return rewriter.notifyMatchFailure(
-          warpOp, "failed to get distributed vector type for source");
-
-    // Yield the shape_cast source from a new warp op.
-    SmallVector<size_t> newRetIndices;
-    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, {castOp.getSource()}, {distSrcTy.value()},
-        newRetIndices);
-
-    // Re-create the shape_cast after the new warp op on the distributed value.
-    rewriter.setInsertionPointAfter(newWarpOp);
-    Value distSource = newWarpOp.getResult(newRetIndices[0]);
-    Value distCast = vector::ShapeCastOp::create(rewriter, castOp.getLoc(),
-                                                 distResultTy, distSource);
-    rewriter.replaceAllUsesWith(newWarpOp.getResult(yieldIdx), distCast);
-    return success();
-  }
-};
-
-// Distribute a `vector.extract_strided_slice` op feeding into yield op of an
-// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
-// advanced cases where the distributed dimension is partially extracted and
-// currently not supported by the generic vector distribution patterns.
-//
-// The extract source is yielded from a new warp op and the extract is
-// re-created after it. When the result is distributed (in exactly one
-// dimension), the source type, size and offset along that dimension are
-// rewritten to their per-lane values; this requires unit lane data and a
-// source size and offset that are multiples of the subgroup size.
-struct VectorExtractStridedSliceDistribution
-    : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
-    if (!operand)
-      return failure();
-    auto extractOp =
-        cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
-    unsigned operandIdx = operand->getOperandNumber();
-    auto distributedType =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    // Find the distributed dimensions.
-    auto extractResultType = cast<VectorType>(operand->get().getType());
-    auto distributedDims =
-        getDistributedDims(extractResultType, distributedType);
-    // Collect updated source type, sizes and offsets. They may be adjusted
-    // later if the data is distributed to lanes (as opposed to being owned by
-    // all lanes uniformly).
-    VectorType updatedSourceType = extractOp.getSourceVectorType();
-    SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
-        extractOp.getSizes(), [](Attribute attr) { return attr; });
-    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
-        extractOp.getOffsets(), [](Attribute attr) { return attr; });
-    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
-        extractOp.getStrides(), [](Attribute attr) { return attr; });
-    // If the provided sizes, offsets, strides are less than the rank, pad them
-    // with full sizes, zero offsets, and unit strides. This makes it easier to
-    // adjust them later.
-    int64_t sourceRank = extractOp.getSourceVectorType().getRank();
-    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
-      updatedSizes.push_back(rewriter.getI64IntegerAttr(
-          extractOp.getSourceVectorType().getDimSize(i)));
-      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
-      updatedStrides.push_back(
-          rewriter.getI64IntegerAttr(1)); // stride is always 1.
-    }
-    // If the result is distributed, it must be distributed in exactly one
-    // dimension. In this case, we adjust the sourceDistType, distributedSizes
-    // and distributedOffsets accordingly.
-    if (distributedDims.size() > 0) {
-      if (distributedDims.size() != 1)
-        return rewriter.notifyMatchFailure(
-            warpOp, "Source can not be distributed in multiple dimensions.");
-      int64_t distributedDim = distributedDims[0];
-      int64_t sourceDistrDimSize =
-          extractOp.getSourceVectorType().getShape()[distributedDim];
-      auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
-      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
-        return rewriter.notifyMatchFailure(
-            warpOp, "the source of extract_strided_slice op lacks distribution "
-                    "layout");
-      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
-      // Because only single dimension distribution is supported, lane layout
-      // size at the distributed dim must be the subgroup size.
-      int64_t subgroupSize = sourceLaneLayout[distributedDim];
-      // Check if the source size in the distributed dimension is a multiple of
-      // subgroup size.
-      if (sourceDistrDimSize % subgroupSize != 0)
-        return rewriter.notifyMatchFailure(
-            warpOp,
-            "Source size along distributed dimension is not a multiple of "
-            "subgroup size.");
-      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
-      // We expect lane data to be all ones in this case.
-      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
-        return rewriter.notifyMatchFailure(
-            warpOp, "Expecting unit lane data in source layout");
-      // The offsets in the distributed dimension must be a multiple of subgroup
-      // size.
-      int64_t distrDimOffset =
-          cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
-      if (distrDimOffset % subgroupSize != 0)
-        return rewriter.notifyMatchFailure(
-            warpOp, "Offset along distributed dimension "
-                    "is not a multiple of subgroup size.");
-      // Check the distributed source type before unwrapping it. No IR has been
-      // modified yet, so bailing out here leaves the warp op untouched.
-      FailureOr<VectorType> distSourceTypeOrFailure =
-          getDistVecTypeBasedOnLaneLayout(sourceLayout,
-                                          extractOp.getSourceVectorType());
-      if (failed(distSourceTypeOrFailure))
-        return rewriter.notifyMatchFailure(
-            warpOp, "Failed to distribute the source vector type.");
-      updatedSourceType = distSourceTypeOrFailure.value();
-      // Update the distributed sizes to match the distributed type.
-      updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
-          distributedType.getDimSize(distributedDim));
-      // Update the distributed offsets to match round robin distribution (i.e.
-      // each lane owns data at `subgroupSize` stride given unit lane data).
-      updatedOffsets[distributedDim] =
-          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
-    }
-    // Do the distribution by yielding the source of the extract op from
-    // the warp op and creating a new extract op outside the warp op.
-    SmallVector<size_t> newRetIndices;
-    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
-        newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    Value source = newWarpOp.getResult(newRetIndices[0]);
-    // Create a new extract op outside the warp op.
-    Value newExtractOp = vector::ExtractStridedSliceOp::create(
-        rewriter, extractOp.getLoc(), distributedType, source,
-        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
-        ArrayAttr::get(rewriter.getContext(), updatedSizes),
-        ArrayAttr::get(rewriter.getContext(), updatedStrides));
-    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
-    return success();
-  }
-};
-
-/// Distribute a `vector.insert_strided_slice` op feeding into yield op of an
-/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
-/// advanced cases where the distributed dimension is partially inserted and
-/// currently not supported by the generic vector distribution patterns.
-struct VectorInsertStridedSliceDistribution
-    : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
-      // Check if the InsertStridedSliceOp is the last op before yield op
-      return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
-             warpOp.getTerminator()->getPrevNode() == op;
-    });
-    if (!operand)
-      return failure();
-    unsigned int operandNumber = operand->getOperandNumber();
-    auto insertOp =
-        operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
-    auto distributedType =
-        cast<VectorType>(warpOp.getResult(operandNumber).getType());
-    // Find the distributed dimensions of the dest vector.
-    auto insertResultType = cast<VectorType>(operand->get().getType());
-    auto destDistributedDims =
-        getDistributedDims(insertResultType, distributedType);
-    // Collect updated offsets, source type and dest type. They may be adjusted
-    // later if the data is distributed to lanes (as opposed to being owned by
-    // all lanes uniformly).
-    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
-        insertOp.getOffsets(), [](Attribute attr) { return attr; });
-    VectorType updatedSourceType = insertOp.getSourceVectorType();
-    VectorType updatedDestType = insertOp.getDestVectorType();
-    if (destDistributedDims.size() > 0) {
-      // Only single dimension distribution is supported.
-      if (destDistributedDims.size() != 1)
-        return rewriter.notifyMatchFailure(
-            warpOp,
-            "Expecting source to be distributed in a single dimension.");
-      int64_t destDistributedDim = destDistributedDims[0];
-
-      VectorType srcType = insertOp.getSourceVectorType();
-      VectorType destType = insertOp.getDestVectorType();
-      // Currently we require that both source (kD) and dest (nD) vectors are
-      // distributed. This requires that distributedDim (d) is contained in the
-      // last k dims of the dest vector (d >= n - k).
-      int64_t sourceDistributedDim =
-          destDistributedDim - (destType.getRank() - srcType.getRank());
-      if (sourceDistributedDim < 0)
-        return rewriter.notifyMatchFailure(
-            insertOp,
-            "distributed dimension must be in the last k (i.e. source "
-            "rank) dims of dest vector");
-      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
-      // Obtain the source and dest layouts.
-      auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
-      auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
-      if (!destLayout || !sourceLayout ||
-          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
-          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
-        return rewriter.notifyMatchFailure(
-            warpOp, "the source or dest of insert_strided_slice op lacks "
-                    "distribution layout");
-      // Because only single dimension distribution is supported, lane layout
-      // size at the distributed dim must be the subgroup size.
-      int subgroupSize =
-          destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
-      // We require that source and dest lane data are all ones to ensure
-      // uniform round robin distribution.
-      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
-      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
-      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
-          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
-        return rewriter.notifyMatchFailure(
-            warpOp, "Expecting unit lane data in source and dest layouts");
-      // Source distributed dim size must be multiples of subgroup size.
-      if (srcDistrDimSize % subgroupSize != 0)
-        return rewriter.notifyMatchFailure(
-            warpOp, "Distributed dimension size in source is not a multiple of "
-                    "subgroup size.");
-      // Offsets in the distributed dimension must be multiples of subgroup
-      // size.
-      int64_t destDistrDimOffset =
-          cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
-      if (destDistrDimOffset % subgroupSize != 0)
-        return rewriter.notifyMatchFailure(
-            warpOp,
-            "Offset along distributed dimension in dest is not a multiple of "
-            "subgroup size.");
-      // Update the source and dest types based on their layouts.
-      updatedSourceType = getDistVecTypeBasedOnLaneLayout(
-                              sourceLayout, insertOp.getSourceVectorType())
-                              .value();
-      updatedDestType = getDistVecTypeBasedOnLaneLayout(
-                            destLayout, insertOp.getDestVectorType())
-                            .value();
-      // Update the distributed offsets to match round robin distribution (i.e.
-      // each lane owns data at `subgroupSize` stride given unit lane data).
-      updatedOffsets[destDistributedDim] =
-          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
-    }
-    // Do the distribution by yielding the source and dest of the insert op
-    // from the warp op and creating a new insert op outside the warp op.
-    SmallVector<size_t> newRetIndices;
-    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
-        {updatedSourceType, updatedDestType}, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-
-    Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
-    Value dest = newWarpOp.getResult(newRetIndices[1]);
-    // Create a new insert op outside the warp op.
-    Value newInsertOp = vector::InsertStridedSliceOp::create(
-        rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
-        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
-        insertOp.getStrides());
-    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
-                                newInsertOp);
-    return success();
-  }
-};
-
-/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an
-/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op
-/// outside of the warp op.
-struct MemrefExtractAlignedPointerAsIndexDistribution final
-    : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(
-        warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp,
-          "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");
-    auto extractOp =
-        operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, extractOp.getSource(),
-        TypeRange{extractOp.getSource().getType()}, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create(
-        rewriter, newWarpOp.getLoc(), extractOp.getType(),
-        newWarpOp.getResult(newRetIndices[0]));
-    Value resultVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(resultVal, newExtractOp.getResult());
-    return success();
-  }
-};
-
-/// Distribute a vector::BitCastOp feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost
-/// diemension of the source/result vectors. Equivalent vector::BitCastOp is
-/// created outside of the warp op with distributed source vector type (computed
-/// using assigned layout).
-struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a vector::BitCast op");
-    auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    VectorType distributedSourceType =
-        getDistVecTypeBasedOnLaneLayout(
-            xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)),
-            bitcastOp.getSourceVectorType())
-            .value_or(VectorType());
-    if (!distributedSourceType)
-      return rewriter.notifyMatchFailure(
-          bitcastOp, "Failed to distribute the source vector type in "
-                     "vector::BitCast op");
-    VectorType distributedResultType =
-        cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, bitcastOp.getSource(),
-        TypeRange{distributedSourceType}, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    auto newBitcastOp = vector::BitCastOp::create(
-        rewriter, newWarpOp.getLoc(), distributedResultType,
-        newWarpOp.getResult(newRetIndices[0]));
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult());
-    return success();
-  }
-};
-
-/// Distribute a vector::TransposeOp feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are
-/// supported. In most cases, transpose is a no op because it is entirely
-/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns
-/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local
-/// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent
-/// vector::TransposeOp outside of the warp op with distributed source vector
-/// type (computed using assigned layout).
-struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand =
-        getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a vector::Transpose op");
-    auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    xegpu::DistributeLayoutAttr sourceLayout =
-        xegpu::getTemporaryLayout(transposeOp->getOpOperand(0));
-    xegpu::DistributeLayoutAttr resultLayout =
-        xegpu::getTemporaryLayout(transposeOp->getOpResult(0));
-    if (!sourceLayout || !resultLayout)
-      return rewriter.notifyMatchFailure(
-          transposeOp,
-          "the source or result vector of the transpose op lacks layout "
-          "attribute");
-    int64_t sourceRank = transposeOp.getSourceVectorType().getRank();
-    int64_t resultRank = transposeOp.getResultVectorType().getRank();
-    // Only 2D transposes are supported for now.
-    // TODO: Support nD transposes.
-    if (sourceRank != 2 || resultRank != 2)
-      return rewriter.notifyMatchFailure(
-          transposeOp, "the source or result vector of the transpose op "
-                       "does not have 2D layout");
-    ArrayRef<int64_t> perm = transposeOp.getPermutation();
-    // Result layout must be a transpose of source layout.
-    if (!resultLayout.isTransposeOf(sourceLayout, perm,
-                                    xegpu::LayoutKind::Lane))
-      return rewriter.notifyMatchFailure(
-          transposeOp,
-          "the source or result vector layouts must be 2D transposes of each "
-          "other");
-    FailureOr<VectorType> distributedSourceTypeOrFailure =
-        getDistVecTypeBasedOnLaneLayout(sourceLayout,
-                                        transposeOp.getSourceVectorType());
-    if (failed(distributedSourceTypeOrFailure))
-      return rewriter.notifyMatchFailure(
-          transposeOp, "Failed to distribute the source vector type in "
-                       "vector::Transpose op");
-    SmallVector<size_t> newRetIndices;
-    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, transposeOp.getVector(),
-        TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices);
-    rewriter.setInsertionPointAfter(newWarpOp);
-    auto newTransposeOp = vector::TransposeOp::create(
-        rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]),
-        perm);
-    Value distributedVal = newWarpOp.getResult(operandIdx);
-    rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult());
-    return success();
-  }
-};
-
-/// Distribute a vector::StepOp with the sliced result layout.
-/// The sliced layout must have exactly 1 effective lane dimension.
-/// We completely resolve the vector::StepOp by computing the lane_data-sized
-/// subranges.
-struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
-  using gpu::WarpDistributionPattern::WarpDistributionPattern;
-  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
-                                PatternRewriter &rewriter) const override {
-    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
-    if (!operand)
-      return rewriter.notifyMatchFailure(
-          warpOp, "warp result is not a vector::StepOp op");
-    auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
-    unsigned operandIdx = operand->getOperandNumber();
-    xegpu::DistributeLayoutAttr resultLayout =
-        xegpu::getTemporaryLayout(stepOp->getResult(0));
-    if (!resultLayout)
-      return rewriter.notifyMatchFailure(
-          stepOp, "the result vector of the step op lacks layout "
-                  "attribute");
-    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
-    if (!sliceLayout)
-      return rewriter.notifyMatchFailure(
-          stepOp, "the result layout must be a slice layout");
-    if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
-      return rewriter.notifyMatchFailure(
-          stepOp, "expecting 1 dim in the effective result layout");
-
-    rewriter.setInsertionPointAfter(warpOp);
-    auto loc = stepOp.getLoc();
-    auto stepResultVecTy = stepOp.getResult().getType();
-    Value distributedVal = warpOp.getResult(operandIdx);
-    VectorType newVecTy = cast<VectorType>(distributedVal.getType());
-
-    auto laneDataBlockCoords = resultLayout.computeDistributedCoords(
-        rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
-    if (failed(laneDataBlockCoords))
-      return rewriter.notifyMatchFailure(
-          stepOp, "failed to compute lane data block coordinates");
-
-    auto laneDataBlockCoordsVec = laneDataBlockCoords.value();
-    auto laneDataBlockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
-    assert(static_cast<int64_t>(laneDataBlockCoordsVec.size()) ==
-           newVecTy.getNumElements() / laneDataBlockLength);
-    SmallVector<Value> stepVals;
-    // For each lane_data block, reconstruct its sub-range
-    // from the range of SG-level vector.step. Example: vector.step
-    // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
-    // vector<16xindex>
-    // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
-    // The blocks are round-robin distributed, so logical lane id 0
-    // holds values [0,1, 8,9].
-    for (auto &laneDataBlockCoords : laneDataBlockCoordsVec) {
-      auto laneDataBlockStartCoord = laneDataBlockCoords[0];
-      stepVals.push_back(laneDataBlockStartCoord);
-      for (int i = 1; i < laneDataBlockLength; ++i) {
-        auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
-        stepVals.push_back(arith::AddIOp::create(
-            rewriter, loc, laneDataBlockStartCoord, offset));
-      }
-    }
-    assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
-           "Expecting the number of step values to match the number of "
-           "elements in the vector");
-    auto stepOpVal =
-        vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
-    rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
-    return success();
-  }
-};
-
-struct ConvertLayoutDistribution
-    : public OpRewritePattern<xegpu::ConvertLayoutOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
-                                PatternRewriter &rewriter) const override {
-    auto inputLayout = op.getInputLayoutAttr();
-    auto targetLayout = op.getTargetLayoutAttr();
-    Type valType = op.getResult().getType();
-
-    if (!inputLayout || !targetLayout)
-      return rewriter.notifyMatchFailure(op, "missing layout attributes");
-
-    if (valType.isIntOrFloat()) {
-      rewriter.replaceOp(op, op.getSource());
-      return success();
-    }
-    auto resShape = cast<VectorType>(valType).getShape();
-    SmallVector<int64_t> resShapeVec(resShape.begin(), resShape.end());
-    if (!inputLayout.isCompatibleWith(targetLayout, resShapeVec,
-                                      xegpu::LayoutKind::Lane)) {
-      return rewriter.notifyMatchFailure(
-          op, "lowering incompatible convert_layout not yet supported");
-    }
-    rewriter.replaceOp(op, op.getSource());
-    return success();
-  }
-};
-
-} // namespace
-
-namespace {
-struct XeGPUSubgroupDistributePass final
-    : public xegpu::impl::XeGPUSubgroupDistributeBase<
-          XeGPUSubgroupDistributePass> {
-  void runOnOperation() override;
-};
-} // namespace
-
-void xegpu::populateXeGPUSubgroupDistributePatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
-               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
-               GpuBarrierDistribution, VectorMultiReductionDistribution,
-               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
-               VectorBitcastDistribution, LoadMatrixDistribution,
-               StoreMatrixDistribution, ConvertLayoutDistribution,
-               MemrefExtractAlignedPointerAsIndexDistribution>(
-      patterns.getContext(),
-      /*pattern benefit=*/PatternHierarchy::Regular);
-  // For following patterns, we need to override the regular vector distribution
-  // patterns. Therefore, assign higher benefit.
-  patterns
-      .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
-           VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
-           VectorStepSliceDistribution, SinkUniformOps>(
-          patterns.getContext(),
-          /*pattern benefit=*/PatternHierarchy::AboveRegular);
-}
-
-void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext());
-}
-
-void XeGPUSubgroupDistributePass::runOnOperation() {
-  // Step 1: Attach layouts to op operands.
-  // TODO: Following assumptions are made:
-  // 1) It is assumed that there are no layout conflicts.
-  // 2) Any existing layout attributes attached to the operands are ignored.
-  Operation *op = getOperation();
-  if (!xegpu::recoverTemporaryLayouts(op)) {
-    signalPassFailure();
-    return;
-  }
-
-  // Step 2: Move all operations of a GPU function inside
-  // gpu.warp_execute_on_lane_0 operation.
-  {
-    RewritePatternSet patterns(&getContext());
-    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
-
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-      signalPassFailure();
-      return;
-    }
-    // At this point, we have moved the entire function body inside the
-    // warpOp. Now move any scalar uniform code outside of the warpOp (like
-    // GPU index ops, scalar constants, etc.). This will simplify the
-    // later lowering and avoid custom patterns for these ops.
-    getOperation()->walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
-        vector::moveScalarUniformCode(warpOp);
-    });
-  }
-  // Step 3: Apply subgroup to workitem distribution patterns.
-  RewritePatternSet patterns(&getContext());
-  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  // distributionFn is used by vector distribution patterns to determine the
-  // distributed vector type for a given vector value. In XeGPU subgroup
-  // distribution context, we compute this based on lane layout.
-  auto distributionFn = [](Value val) {
-    VectorType vecType = dyn_cast<VectorType>(val.getType());
-    int64_t vecRank = vecType ? vecType.getRank() : 0;
-    if (vecRank == 0)
-      return AffineMap::get(val.getContext());
-    // Get the layout of the vector type.
-    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
-    // If no layout is specified, assume uniform case (no distribution).
-    if (!layout)
-      return AffineMap::get(val.getContext());
-    // Expecting vector and layout rank to match.
-    assert(layout.getRank() == vecRank &&
-           "Expecting vector and layout rank to match");
-    // A dimension is distributed only if layout suggests there are
-    // multiple lanes assigned for this dimension and the shape can be evenly
-    // distributed to those lanes.
-    SmallVector<unsigned int> distributedDims;
-    for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
-      if (v > 1 && vecType.getShape()[i] % v == 0)
-        distributedDims.push_back(i);
-    }
-    return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
-                                                val.getContext());
-  };
-  // TODO: shuffleFn is not used.
-  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
-                      int64_t warpSz) { return Value(); };
-
-  vector::populateDistributeReduction(
-      patterns, xegpu::subgroupReduction,
-      /*pattern benefit=*/PatternHierarchy::Regular);
-
-  vector::populatePropagateWarpVectorDistributionPatterns(
-      patterns, distributionFn, shuffleFn,
-      /*pattern benefit=*/PatternHierarchy::Regular);
-  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-    signalPassFailure();
-    return;
-  }
-
-  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
-  // due to tensor desc type mismatches created by using upstream distribution
-  // patterns (scf.for). This cleanup should only be done if all the ops are
-  // distributed successfully, if some ops are still not distributed and remains
-  // inside any WarpExecuteOnLane0Op we avoid this simplication step to avoid
-  // breaking the IR.
-  bool foundWarpOp = false;
-  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
-    // Look for WarpOps that are not trivially dead.
-    if (isOpTriviallyDead(warpOp))
-      return WalkResult::advance();
-    foundWarpOp = true;
-    return WalkResult::interrupt();
-  });
-  if (foundWarpOp)
-    return;
-
-  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
-    // We are only interested in UnrealizedConversionCastOps there were added
-    // for resolving SIMT type mismatches.
-    if (!op->getAttr(resolveSIMTTypeMismatch))
-      return WalkResult::skip();
-
-    Value input = op.getOperand(0);
-    Value output = op.getResult(0);
-
-    // Both input and output must have tensor descriptor types.
-    xegpu::TensorDescType inputDescType =
-        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
-    xegpu::TensorDescType outputDescType =
-        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
-    assert(inputDescType && outputDescType &&
-           "Unrealized conversion cast must have tensor descriptor types");
-
-    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
-    // This occurs inside scf.for body to resolve the block argument type to
-    // SIMT type.
-    if (inputDescType.getLayout()) {
-      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
-      if (argument) {
-        argument.setType(output.getType());
-        output.replaceAllUsesWith(argument);
-        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
-                argument.getOwner()->getParentOp())) {
-          auto result = loopOp.getTiedLoopResult(argument);
-          result.setType(output.getType());
-        }
-      }
-    }
-
-    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
-    // conversions. This occurs at the yield op of scf.for body to go back
-    // from SIMT type to original type.
-    if (outputDescType.getLayout())
-      output.replaceAllUsesWith(input);
-
-    if (op->use_empty())
-      op->erase();
-    return WalkResult::advance();
-  });
-
-  xegpu::removeTemporaryLayoutAttrs(getOperation());
-}

diff  --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 9c2d8e6aa5247..0fb0ac6e3416d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -717,8 +717,6 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
   Value reductionResult = arith::ConstantOp::create(
       rewriter, loc, acc.getType(),
       DenseElementsAttr::get(acc.getType(), zeroAttr));
-  // TODO: Remove these get/setTemporaryLayout calls after we deprecate the old
-  // XeGPUSubgroupDistribute pass.
   auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
   auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
   // Reduction result should have the same layout as the accumulator.

diff  --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
deleted file mode 100644
index 3c2d987039840..0000000000000
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file %s | FileCheck %s
-
-gpu.module @test {
-gpu.func @empty()  {
-  gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @empty()
-// CHECK-NEXT:      gpu.return
-
-// -----
-gpu.module @test {
-gpu.func @gemm(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
-  %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-  %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-  %2 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-  %3 = xegpu.load_nd %1[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-  %4 = xegpu.dpas %2, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-  %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-  xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @gemm(
-// CHECK:         %[[ARG0:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<16x16xf16>,
-// CHECK-SAME:    %[[ARG2:[a-zA-Z0-9]+]]: memref<8x16xf32>)
-// CHECK:         %[[LANEID:.*]] = gpu.lane_id
-// CHECK-NEXT:    gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
-// CHECK-SAME:      args(%[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>)
-// CHECK:           ^bb0(%[[ARG3:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG4:[a-zA-Z0-9]+]]: memref<16x16xf16>,
-// CHECK-SAME:      %[[ARG5:[a-zA-Z0-9]+]]: memref<8x16xf32>):
-// CHECK-NEXT:      %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG3]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT:      %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG4]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT:      %[[T3:.*]] = xegpu.load_nd %[[T1]][{{.*}}]  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT:      %[[T4:.*]] = xegpu.load_nd %[[T2]][{{.*}}]  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT:      %[[T5:.*]] = xegpu.dpas %[[T3]], %[[T4]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT:      %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG5]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT:      xegpu.store_nd %[[T5]], %[[T6]][%{{.*}}]  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-// CHECK:         gpu.return
-
-// -----
-gpu.module @test {
-gpu.func @already_in_warp_op() {
-  %laneid = gpu.lane_id
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    "test.unknown"() : () -> ()
-    gpu.yield
-  }
-  gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @already_in_warp_op()
-// CHECK:         %[[LANEID:.*]] = gpu.lane_id
-// CHECK:         gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
-// CHECK:           "test.unknown"() : () -> ()
-// CHECK:         gpu.return
-
-// -----
-gpu.module @test {
-"gpu.func"() ({
-^bb0:
-  "test.unknown"() : () -> ()
-}) {function_type = () -> (), kernel, sym_name = "missing_return_terminator"} : () -> ()
-}
-
-// Regression test for MoveFuncBodyToWarpOp on malformed generic gpu.func.
-// CHECK-LABEL: gpu.func @missing_return_terminator
-// CHECK-NEXT:    "test.unknown"() : () -> ()
-
-// -----
-
-gpu.module @test {
-  gpu.func @multiple_blocks(%cond: i1) {
-    cf.cond_br %cond, ^bb1, ^bb2
-  ^bb1:  // pred: ^bb0
-    "test.unknown"() : () -> ()
-    cf.br ^bb2
-  ^bb2:  // 2 preds: ^bb0, ^bb1
-    gpu.return
-  }
-}
-
-// CHECK-LABEL: gpu.func @multiple_blocks
-// CHECK-SAME:  %[[COND:.*]]: i1
-// CHECK-NEXT:  cf.cond_br %[[COND]], ^bb1, ^bb2
-// CHECK:       ^bb1:
-// CHECK-NEXT:    "test.unknown"() : () -> ()
-// CHECK-NEXT:    cf.br ^bb2
-// CHECK:       ^bb2:
-// CHECK-NEXT:    gpu.return

diff  --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
similarity index 99%
rename from mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
rename to mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
index f1e56f4493ec7..7f52391bd7928 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
@@ -1,6 +1,6 @@
 
 // RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=cri' --allow-unregistered-dialect \
-// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+// RUN: --test-xegpu-sg-to-lane-distribute --split-input-file %s | FileCheck %s
 
 
 gpu.module @xevm_module {

diff  --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
similarity index 99%
rename from mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
rename to mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
index c8a9530641951..01f0c1e3e950e 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt --allow-unregistered-dialect --xevm-attach-target='module=xevm_* chip=pvc' \
-// RUN: --xegpu-sg-to-wi-distribute-experimental --split-input-file %s --canonicalize --cse | FileCheck %s
+// RUN: --xegpu-sg-to-lane-distribute --split-input-file %s --canonicalize --cse | FileCheck %s
 
 // CHECK-LABEL: gpu.func @gemm
 // CHECK-DAG  : %[[C0:.*]] = arith.constant 0 : index

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
deleted file mode 100644
index 8ab627a95e0a1..0000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ /dev/null
@@ -1,1271 +0,0 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute  \
-// RUN: -allow-unregistered-dialect -canonicalize -cse  %s | FileCheck %s
-gpu.module @xevm_module{
-// CHECK-LABEL: gpu.func @store_nd_1d
-// CHECK:         (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:         %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
-// CHECK-SAME:      -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK:           gpu.yield %{{.*}} : vector<16xf32>,
-// CHECK-SAME:        !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
-// CHECK-NEXT:    xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2]  : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.func @store_nd_1d(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    %cst = "some_op"() : () -> vector<16xf32>
-    xegpu.store_nd %cst, %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-      : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  }
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @store_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
-// CHECK-SAME:    -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK:         gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16>
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT:  xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3]  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.func @store_nd_2d(%laneid : index) {
-  %c0 = arith.constant 0 : index
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %cst = "some_op"() : () -> vector<16x16xf16>
-    xegpu.store_nd %cst, %0 [%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  }
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>,
-// CHECK-SAME:    !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK:         gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
-// CHECK-NEXT:  xegpu.load_nd %[[T1]][%[[W]]#2]  : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
-gpu.func @load_nd_1d(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    %1 = xegpu.load_nd %0 [%c0]  {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
-      !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
-    gpu.yield %1 : vector<16xf32>
-  }
-  "some_user_op"(%r) : (vector<1xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK:         gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:     #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT:  %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3]  : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK:       vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16>
-gpu.func @load_nd_2d(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
-    gpu.yield %1 : vector<16x16xf16>
-  }
-  "some_user_op"(%r) : (vector<16x1xf16>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_array_length
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<32x1xf16>,
-// CHECK-SAME:    !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK:         gpu.yield %{{.*}} : vector<32x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<
-// CHECK-SAME:      array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16],
-// CHECK-SAME:      lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-// CHECK-NEXT:  %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3]  : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:    #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
-// CHECK-NEXT:  vector.shape_cast %[[T2]] : vector<32xf16> to vector<32x1xf16>
-gpu.func @load_nd_array_length(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<32x1xf16>) {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
-      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
-        #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x16xf16>
-    gpu.yield %1 : vector<32x16xf16>
-  }
-  "some_user_op"(%r) : (vector<32x1xf16>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @dpas
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] ->
-// CHECK-SAME:    (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
-// CHECK:         gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
-// CHECK-NEXT:  }
-// CHECK-DAG:   %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16>
-// CHECK-DAG:   %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16>
-// CHECK-DAG:   %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT:  %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT:  vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-gpu.func @dpas(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
-    %0 = "some_op"() : () -> vector<8x16xf16>
-    %1 = "some_op"() : () -> vector<16x16xf16>
-    %2 = "some_op"() : () -> vector<8x16xf32>
-    %3 = xegpu.dpas %0, %1, %2
-      {
-        layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-        layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-    gpu.yield %3 : vector<8x16xf32>
-  }
-  "some_user_op"(%r) : (vector<8x1xf32>) -> ()
-  gpu.return
-}
-
-
-
-// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) {
-// CHECK:         gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT:  builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch}
-gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
-    %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 ->
-      !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  }
-  "some_user_op"(%r)
-    : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @prefetch_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:      #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK:         gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-SAME:      , index, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT:  xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2]
-// CHECK-SAME:    <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.func @prefetch_2d(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %0 = "some_op"() : ()
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.prefetch_nd %0[%c0, %c0]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
-      : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  }
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @prefetch_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16,
-// CHECK-SAME:     #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK:         gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16,
-// CHECK-SAME:    #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT:  xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>,
-// CHECK-SAME:    l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.func @prefetch_1d(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %0 = "some_op"() : ()
-      -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    xegpu.prefetch_nd %0[%c0]
-      {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
-      : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  }
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
-// CHECK:  gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK:     gpu.yield %{{.*}}
-// CHECK:  }
-// CHECK:  %{{.*}} = xegpu.load_nd %{{.*}}  : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
-// CHECK:  gpu.barrier
-gpu.func @gpu_barrier(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
-    %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-    %1 = xegpu.load_nd %0[%c0]
-      {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
-      : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
-    gpu.barrier
-    gpu.yield %1 : vector<16xf16>
-  }
-  "some_user_op"(%r) : (vector<1xf16>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
-// CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) {
-// CHECK:         %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32>
-// CHECK:         gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32>
-// CHECK-NEXT:  }
-// CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:    {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
-// CHECK:       %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32
-// CHECK:       %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:    {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
-// CHECK:       %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
-// CHECK:       %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32>
-gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<16x32xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
-      dense<0.0>  : vector<32xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-    {
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
-      layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
-    }  [0]
-    : vector<16x32xf32> to vector<32xf32>
-    gpu.yield %1 : vector<32xf32>
-  }
-  "some_user_op"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK:      %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK-NEXT:   %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
-// CHECK-NEXT:   %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:   %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT:   %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:   %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<2x16xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<2xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
-      }
-      [1] : vector<2x16xf32> to vector<2xf32>
-    gpu.yield %1 : vector<2xf32>
-  }
-  "some_user_op"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-
-
-// CHECK-LABEL:   gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
-// CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
-// CHECK:         %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32>
-// CHECK:         gpu.yield %9, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32>
-// CHECK:       }
-// CHECK:       %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32>
-// CHECK:       %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
-// CHECK:       %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32
-// CHECK:       %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
-// CHECK:       %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
-// CHECK:       %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32
-// CHECK:       %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
-gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      : () -> (vector<32x16xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<32xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
-      }
-      [1] : vector<32x16xf32> to vector<32xf32>
-    gpu.yield %1 : vector<32xf32>
-  }
-  "some_user_op"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:       %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK:       %[[SRC:.*]] = "some_def"()
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-// CHECK-SAME:    : () -> vector<16x2xf32>
-// CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]]
-// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
-// CHECK:       %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME:    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK:       %[[T5:.*]] = vector.shape_cast %[[T4]]
-// CHECK-SAME:    {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME:     layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME:    : vector<16x1xf32> to vector<16xf32>
-// CHECK:       %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
-gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-      : () -> (vector<16x2xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-      dense<0.0>  : vector<2xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
-      }
-      [0] : vector<16x2xf32> to vector<2xf32>
-    gpu.yield %1 : vector<2xf32>
-  }
-  "some_user_op"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim
-// CHECK:       %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<1x32xf32>
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<1x2xf32>, vector<1x16x2xf32>, vector<1x2xf32>) {
-// CHECK:         %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x16x32xf32>
-// CHECK:         gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<1x32xf32>, vector<1x16x32xf32>, vector<1x32xf32>
-// CHECK-NEXT:  }
-// CHECK:       %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:    {offsets = [0, 0, 0], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
-// CHECK:       %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<1x16x1xf32> to vector<16xf32>
-// CHECK:       %[[T3:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x2xf32>
-// CHECK:       %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32
-// CHECK:       %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:    {offsets = [0, 0, 1], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
-// CHECK:       %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<1x16x1xf32> to vector<16xf32>
-// CHECK:       %[[T7:.*]] = vector.extract %[[W]]#2[0, 1] : f32 from vector<1x2xf32>
-// CHECK:       %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
-// CHECK:       %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<1x2xf32>
-gpu.func @vector_multi_reduction_3d_leading_unit_dim(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x2xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-      : () -> (vector<1x16x32xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
-      dense<0.0>  : vector<1x32xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
-      }
-      [1] : vector<1x16x32xf32> to vector<1x32xf32>
-    gpu.yield %1 : vector<1x32xf32>
-  }
-  "some_user_op"(%r) : (vector<1x2xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_trivial_reduction
-// CHECK:       %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<1x1xf32>, vector<1x1x1xf32>, vector<1x1xf32>) {
-// CHECK:         %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x1x16xf32>
-// CHECK:         gpu.yield %{{.*}}, %[[SRC]], %{{.*}} : vector<1x16xf32>, vector<1x1x16xf32>, vector<1x16xf32>
-// CHECK-NEXT:  }
-// CHECK:       %[[A:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x1xf32>
-// CHECK:       %[[S:.*]] = vector.extract %[[W]]#1[0, 0, 0] : f32 from vector<1x1x1xf32>
-// CHECK:       %[[ADD:.*]] = arith.addf %[[S]], %[[A]] : f32
-// CHECK:       %[[BC:.*]] = vector.broadcast %[[ADD]] : f32 to vector<1x1xf32>
-gpu.func @vector_multi_reduction_3d_trivial_reduction(%laneid: index) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
-    %src = "some_def"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-      : () -> (vector<1x1x16xf32>)
-    %acc = arith.constant
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
-      dense<0.0> : vector<1x16xf32>
-    %1 = vector.multi_reduction <add>, %src, %acc
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
-        layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
-      }
-      [1] : vector<1x1x16xf32> to vector<1x16xf32>
-    gpu.yield %1 : vector<1x16xf32>
-  }
-  "some_user_op"(%r) : (vector<1x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
-// CHECK:       %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex>
-// CHECK:       %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
-// CHECK:         gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] :
-// CHECK-SAME:      vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
-// CHECK-SAME:    : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT:  xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
-// CHECK-SAME:    : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) {
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %1 = arith.constant dense<1>: vector<16xi1>
-    %offset = arith.constant dense<12> : vector<16xindex>
-    %3 = xegpu.load %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-      : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-    xegpu.store %3, %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
-      : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-  }
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
-// CHECK:       %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex>
-// CHECK:       %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
-// CHECK:         gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
-// CHECK-SAME:    : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3
-// CHECK-SAME:    : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT:  xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
-// CHECK-SAME:    : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) {
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %1 = arith.constant dense<1> : vector<16xi1>
-    %offset = arith.constant dense<12> : vector<16xindex>
-    %3 = xegpu.load %src[%offset], %1
-    {
-      layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-    } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
-    xegpu.store %3, %src[%offset], %1
-    {
-      layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-    }
-    : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-  }
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops_with_leading_dims({{.*}}) {
-// CHECK:       %[[OFFSETS:.*]] = arith.constant dense<12> : vector<1x1x16xindex>
-// CHECK:       %[[MASKS:.*]] = arith.constant dense<true> : vector<1x1x16xi1>
-// CHECK:       %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:    -> (vector<1x1x1xf16>, memref<256xf16>, vector<1x1x1xindex>, vector<1x1x1xi1>) {
-// CHECK:         gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
-// CHECK-SAME:    : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[V1:.*]] = vector.shape_cast %[[W]]#2 : vector<1x1x1xindex> to vector<1xindex>
-// CHECK-NEXT:  %[[V2:.*]] = vector.shape_cast %[[W]]#3 : vector<1x1x1xi1> to vector<1xi1>
-// CHECK-NEXT:  %[[T1:.*]] = xegpu.load %[[W]]#1[%[[V1]]], %[[V2]]
-// CHECK-SAME:    : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT:  xegpu.store %[[T1]], %[[W]]#1[%[[V1]]], %[[V2]]
-// CHECK-SAME:    : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>, %laneid: index) {
-  gpu.warp_execute_on_lane_0(%laneid)[16] {
-    %1 = arith.constant
-      dense<1> : vector<1x1x16xi1>
-    %offset = arith.constant
-      dense<12> : vector<1x1x16xindex>
-    %3 = xegpu.load %src[%offset], %1 {layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>} 
-    : memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
-    xegpu.store %3, %src[%offset], %1 { layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
-    : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
-  }
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
-// CHECK:         gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index
-// CHECK-NEXT:  arith.index_cast %[[INTPTR]] : index to i64
-gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
-    %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
-    gpu.yield %ptr : index
-  }
-  %ptr_i64 = arith.index_cast %r : index to i64
-  "some_user_op"(%ptr_i64) : (i64) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @memref_alloca(
-// CHECK-NEXT:    %[[ALLOCA:.*]] = memref.alloca() : memref<2048xi8, 3>
-// CHECK-NEXT:    %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ALLOCA]] : memref<2048xi8, 3> -> index
-// CHECK-NEXT:    %[[CAST:.*]] = arith.index_cast %[[INTPTR]] : index to i64
-gpu.func @memref_alloca(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<2048xi8, 3>) {
-    %alloca = memref.alloca() : memref<2048xi8, 3>
-    gpu.yield %alloca :  memref<2048xi8, 3>
-  }
-  %ptr = memref.extract_aligned_pointer_as_index %r : memref<2048xi8, 3> -> index
-  %ptr_i64 = arith.index_cast %ptr : index to i64
-  "some_user_op"(%ptr_i64) : (i64) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @create_memdesc(
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3>) {
-// CHECK:         gpu.yield %{{.*}}, %{{.*}} : !xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[MDesc:.*]] = xegpu.create_mem_desc %[[W]]#1 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32>
-gpu.func @create_memdesc(%laneid: index, %arg0 : memref<2048xi8, 3>) {
-  %c0 = arith.constant 0 : index
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.mem_desc<4x128xf32>) {
-    %mdesc = xegpu.create_mem_desc %arg0 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32>
-    gpu.yield %mdesc :  !xegpu.mem_desc<4x128xf32>
-  }
-  %25 = xegpu.load_matrix  %r[%c0, %c0]: !xegpu.mem_desc<4x128xf32>, index, index -> vector<1x16xf32>
-  "some_user_op"(%25) : (vector<1x16xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_transpose(
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) {
-// CHECK:         %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32>
-// CHECK:         gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32>
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
-gpu.func @vector_transpose(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
-    %cst = "some_op"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
-      : () -> (vector<16x2xf32>)
-    %transpose = vector.transpose %cst, [1, 0]
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1], order = [0, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<16x2xf32> to vector<2x16xf32>
-    gpu.yield %transpose : vector<2x16xf32>
-  }
-  "some_user_op"(%r) : (vector<2x1xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_bitcast(
-// CHECK:       %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) {
-// CHECK:         %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8>
-// CHECK:         gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8>
-// CHECK:       }
-// CHECK:       vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
-gpu.func @vector_bitcast(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
-    %cst = "some_op"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
-      : () -> (vector<4x32xi8>)
-    %bitcast = vector.bitcast %cst
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<4x32xi8> to vector<4x16xi16>
-    gpu.yield %bitcast : vector<4x16xi16>
-  }
-  "some_user_op"(%r) : (vector<4x1xi16>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing
-// CHECK:         %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
-// CHECK:           gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32>
-// CHECK:         }
-// CHECK:         %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32>
-gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
-    %cst = "some_op"()
-      {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
-      : () -> (vector<16xf32>)
-    %cast = vector.shape_cast %cst
-      {
-        layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<16xf32> to vector<1x16xf32>
-    gpu.yield %cast : vector<1x16xf32>
-  }
-  "some_user_op"(%r) : (vector<1x1xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing(
-// CHECK:         %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) {
-// CHECK:           gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32>
-// CHECK:         }
-// CHECK:         %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32>
-gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
-    %cst = "some_op"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<1x16xf32>)
-    %cast = vector.shape_cast %cst
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
-      }
-      : vector<1x16xf32> to vector<16xf32>
-    gpu.yield %cast : vector<16xf32>
-  }
-  "some_user_op"(%r) : (vector<1xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL:  gpu.func @vector_shapecast_rank_increasing_without_slicing_layout
-// CHECK:          %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
-// CHECK:            %[[T1:.*]] = vector.shape_cast %{{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
-// CHECK:            gpu.yield %[[T1]], %{{.*}} : vector<1x16xf32>, vector<16xf32>
-// CHECK:          }
-// CHECK:          %{{.*}} = vector.shape_cast %[[W]]#1 : vector<1xf32> to vector<1x1xf32>
-// CHECK:          gpu.return
-gpu.module @xevm_module{
-gpu.func @vector_shapecast_rank_increasing_without_slicing_layout(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
-    %cst = "some_op"()
-      {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
-      : () -> (vector<16xf32>)
-    %cast = vector.shape_cast %cst
-      {
-        layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<16xf32> to vector<1x16xf32>
-    gpu.yield %cast : vector<1x16xf32>
-  }
-  "some_user_op"(%r) : (vector<1x1xf32>) -> ()
-  gpu.return
-}
-}
-
-
-// CHECK-LABEL:  gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
-// CHECK-NEXT:     %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT:       %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
-// CHECK:            gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
-// CHECK-NEXT:     }
-// CHECK-NEXT:     %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:        {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT:     "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
-    %0 = "some_def"() : () -> (vector<24x16xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<24x16xf32> to vector<8x16xf32>
-    gpu.yield %1 : vector<8x16xf32>
-  }
-  "some_use"(%r) : (vector<8x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
-// CHECK-NEXT:    %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT:      %[[S:.*]] = "some_def"() : () -> vector<24x1xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]] : vector<8x1xf32>, vector<24x1xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:      {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_non_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
-    %0 = "some_def"() : () -> (vector<24x1xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<24x1xf32> to vector<8x1xf32>
-    gpu.yield %1 : vector<8x1xf32>
-  }
-  "some_use"(%r) : (vector<8x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
-// CHECK:         %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x4xf32>) {
-// CHECK-NEXT:      %[[S:.*]] = "some_def"() : () -> vector<24x64xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x64xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:      {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
-    %0 = "some_def"() : () -> (vector<24x64xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<24x64xf32> to vector<8x16xf32>
-    gpu.yield %1 : vector<8x16xf32>
-  }
-  "some_use"(%r) : (vector<8x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:  gpu.func @vector_extract_strided_slice_outer_distributed
-// CHECK:          %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) {
-// CHECK-NEXT:       %[[S:.*]] = "some_def"() : () -> vector<32x16xf32>
-// CHECK:            gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32>
-// CHECK:          }
-// CHECK-NEXT:     %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT:     %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32>
-// CHECK-NEXT:     "some_use"(%[[T2]]) : (vector<1x16xf32>) -> ()
-gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) {
-    %0 = "some_def"() : () -> (vector<32x16xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
-      }
-      : vector<32x16xf32> to vector<16x16xf32>
-    gpu.yield %1 : vector<16x16xf32>
-  }
-  "some_use"(%r) : (vector<1x16xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
-// CHECK:         %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) {
-// CHECK:           %[[S:.*]] = "some_def"() : () -> vector<64xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]] : vector<32xf32>, vector<64xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:      {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<2xf32>) -> ()
-gpu.func @vector_extract_strided_slice_1d(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %0 = "some_def"() : () -> (vector<64xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-      }
-      : vector<64xf32> to vector<32xf32>
-    gpu.yield %1 : vector<32xf32>
-  }
-  "some_use"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_offset
-// CHECK:         %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK:         }
-// CHECK-NOT:     %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_offset(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %0 = "some_def"() : () -> (vector<64xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-      }
-      : vector<64xf32> to vector<32xf32>
-    gpu.yield %1 : vector<32xf32>
-  }
-  "some_use"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_source
-// CHECK:         %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK:         }
-// CHECK-NOT:     %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
-    %0 = "some_def"() : () -> (vector<54xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-      }
-      : vector<54xf32> to vector<32xf32>
-    gpu.yield %1 : vector<32xf32>
-  }
-  "some_use"(%r) : (vector<2xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:  gpu.func @vector_extract_strided_slice_partial_offsets
-// CHECK-NEXT:      %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT:        %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
-// CHECK:             gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
-// CHECK-NEXT:      }
-// CHECK-NEXT:      %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME:        {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT:      "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
-    %0 = "some_def"() : () -> (vector<24x16xf32>)
-    %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
-        layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-        layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-      }
-      : vector<24x16xf32> to vector<8x16xf32>
-    gpu.yield %1 : vector<8x16xf32>
-  }
-  "some_use"(%r) : (vector<8x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:  gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
-// CHECK-NEXT:      %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT:        %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT:        %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
-// CHECK:             gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16x16xf32>, vector<64x16xf32>
-// CHECK-NEXT:      }
-// CHECK-NEXT:      %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:        {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-// CHECK-NEXT:      "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
-    %0 = "some_def"() : () -> (vector<16x16xf32>)
-    %1 = "some_def"() : () -> (vector<64x16xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0],  strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-      : vector<16x16xf32> into vector<64x16xf32>
-    gpu.yield %2 : vector<64x16xf32>
-  }
-  "some_use"(%r) : (vector<64x1xf32>) -> ()
-  gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
-// CHECK-NEXT:    %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT:      %[[S:.*]] = "some_def"() : () -> vector<16x1xf32>
-// CHECK-NEXT:      %[[D:.*]] = "some_def"() : () -> vector<64x1xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:      {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_non_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
-    %0 = "some_def"() : () -> (vector<16x1xf32>)
-    %1 = "some_def"() : () -> (vector<64x1xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0],  strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-      : vector<16x1xf32> into vector<64x1xf32>
-    gpu.yield %2 : vector<64x1xf32>
-  }
-  "some_use"(%r) : (vector<64x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
-// CHECK:         %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
-// CHECK-NEXT:      %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT:      %[[D:.*]] = "some_def"() : () -> vector<64x32xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:      {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<64x2xf32>) -> ()
-gpu.func @vector_insert_strided_slice_inner_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x2xf32>) {
-    %0 = "some_def"() : () -> (vector<16x16xf32>)
-    %1 = "some_def"() : () -> (vector<64x32xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16],  strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-      : vector<16x16xf32> into vector<64x32xf32>
-    gpu.yield %2 : vector<64x32xf32>
-  }
-  "some_use"(%r) : (vector<64x2xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:   gpu.func @vector_insert_strided_slice_outer_distributed
-// CHECK:           %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3x32xf32>, vector<1x16xf32>, vector<3x32xf32>) {
-// CHECK-NEXT:        %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT:        %[[D:.*]] = "some_def"() : () -> vector<48x32xf32>
-// CHECK:             gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48x32xf32>, vector<16x16xf32>, vector<48x32xf32>
-// CHECK-NEXT:      }
-// CHECK-NEXT:      %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:        {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
-// CHECK-NEXT:      "some_use"(%[[T1]]) : (vector<3x32xf32>) -> ()
-gpu.func @vector_insert_strided_slice_outer_distributed(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3x32xf32>) {
-    %0 = "some_def"() : () -> (vector<16x16xf32>)
-    %1 = "some_def"() : () -> (vector<48x32xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4],  strides = [1, 1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
-    }
-      : vector<16x16xf32> into vector<48x32xf32>
-    gpu.yield %2 : vector<48x32xf32>
-  }
-  "some_use"(%r) : (vector<3x32xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
-// CHECK:         %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>, vector<1xf32>, vector<3xf32>) {
-// CHECK-NEXT:      %[[S:.*]] = "some_def"() : () -> vector<16xf32>
-// CHECK-NEXT:      %[[D:.*]] = "some_def"() : () -> vector<48xf32>
-// CHECK:           gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48xf32>, vector<16xf32>, vector<48xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:    %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:      {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
-// CHECK-NEXT:    "some_use"(%[[T1]]) : (vector<3xf32>) -> ()
-gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
-    %0 = "some_def"() : () -> (vector<16xf32>)
-    %1 = "some_def"() : () -> (vector<48xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [16],  strides = [1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-    }
-      : vector<16xf32> into vector<48xf32>
-    gpu.yield %2 : vector<48xf32>
-  }
-  "some_use"(%r) : (vector<3xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
-// CHECK-NEXT:      %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT:        %[[S:.*]] = "some_def"() : () -> vector<16xf32>
-// CHECK-NEXT:        %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
-// CHECK:             gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16xf32>, vector<64x16xf32>
-// CHECK-NEXT:      }
-// CHECK-NEXT:      %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME:        {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
-// CHECK-NEXT:      "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_different_ranks(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
-    %0 = "some_def"() : () -> (vector<16xf32>)
-    %1 = "some_def"() : () -> (vector<64x16xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0],  strides = [1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-      : vector<16xf32> into vector<64x16xf32>
-    gpu.yield %2 : vector<64x16xf32>
-  }
-  "some_use"(%r) : (vector<64x1xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:  gpu.func @vector_insert_strided_slice_unsupported_source
-// CHECK:          %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
-// CHECK:          }
-// CHECK-NOT:      %{{.*}} = vector.insert_strided_slice
-gpu.func @vector_insert_strided_slice_unsupported_source(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
-    %0 = "some_def"() : () -> (vector<8xf32>)
-    %1 = "some_def"() : () -> (vector<48xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [16],  strides = [1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-    }
-      : vector<8xf32> into vector<48xf32>
-    gpu.yield %2 : vector<48xf32>
-  }
-  "some_use"(%r) : (vector<3xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL:  gpu.func @vector_insert_strided_slice_unsupported_offset
-// CHECK:          %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
-// CHECK:          }
-// CHECK-NOT:      %{{.*}} = vector.insert_strided_slice
-gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
-  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
-    %0 = "some_def"() : () -> (vector<16xf32>)
-    %1 = "some_def"() : () -> (vector<48xf32>)
-    %2 = vector.insert_strided_slice %0, %1 { offsets = [3],  strides = [1],
-      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
-    }
-      : vector<16xf32> into vector<48xf32>
-    gpu.yield %2 : vector<48xf32>
-  }
-  "some_use"(%r) : (vector<3xf32>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane
-// CHECK-SAME: (%[[ARG0:.*]]: index) {
-// CHECK: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1x16x1xf16>, vector<1xf16>, vector<16x1xf16>)
-// CHECK: %[[DEF0:.*]] = "some_def"() : () -> vector<16xf16>
-// CHECK: %[[DEF1:.*]] = "some_def"() : () -> vector<16x16xf16>
-// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF0]]
-// CHECK: %[[CAST_INNER:.*]] = vector.shape_cast %[[DEF1]] : vector<16x16xf16> to vector<1x16x16xf16>
-// CHECK: gpu.yield %[[BCAST_INNER]], %[[CAST_INNER]], %[[DEF0]], %[[DEF1]]
-// CHECK: %[[CAST:.*]] = vector.shape_cast %[[R]]#3 : vector<16x1xf16> to vector<1x16x1xf16>
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#2 : vector<1xf16> to vector<16x1xf16>
-// CHECK: "some_use"(%[[BCAST]]) : (vector<16x1xf16>) -> ()
-// CHECK: "some_use"(%[[CAST]]) : (vector<1x16x1xf16>) -> ()
-gpu.func  @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane(%laneid: index) {
-
-  %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>, vector<1x16x1xf16>) {
-
-    %1 = "some_def"() : () -> vector<16xf16>
-    %3 = "some_def"() : () -> vector<16x16xf16>
-
-    %2 = vector.broadcast %1 {
-      layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    } : vector<16xf16> to vector<16x16xf16>
-
-    %4 = vector.broadcast %3 {
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
-    } : vector<16x16xf16> to vector<1x16x16xf16>
-
-    gpu.yield %2, %4 : vector<16x16xf16>, vector<1x16x16xf16>
-  }
-  "some_use"(%r#0) : (vector<16x1xf16>) -> ()
-  "some_use"(%r#1) : (vector<1x16x1xf16>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>)
-// CHECK:   %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16>
-// CHECK:   %[[BCAST:.*]] = vector.broadcast %[[DEF]]
-// CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16>
-// CHECK:   gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16>
-// CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> ()
-gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) {
-  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
-    %1 = "some_def"() : () -> vector<16x1xf16>
-    %2 = vector.broadcast %1 {
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    } : vector<16x1xf16> to vector<16x16xf16>
-    gpu.yield %2: vector<16x16xf16>
-  }
-  "some_use"(%0) : (vector<16x1xf16>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16)
-// CHECK: %[[DEF:.*]] = "some_def"()
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
-// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
-// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16>
-// CHECK: "some_use"(%[[RESULT]])
-gpu.func
-@vector_shape_cast_scalar_to_vector(%arg0: index) {
-  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
-    %1 = "some_def"() : () -> f16
-    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
-    gpu.yield %2 : vector<16x16xf16>
-  }
-  "some_use"(%0) : (vector<16x1xf16>) -> ()
-  gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector_uniform
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x16xf16>, f16)
-// CHECK: %[[DEF:.*]] = "some_def"()
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] : f16 to vector<16x16xf16>
-// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
-// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x16xf16>
-// CHECK: "some_use"(%[[RESULT]])
-  gpu.func @vector_shape_cast_scalar_to_vector_uniform(%arg0: index) {
-    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x16xf16>) {
-      %1 = "some_def"() : () -> f16
-      %2 = vector.broadcast %1 : f16 to vector<16x16xf16>
-      gpu.yield %2 : vector<16x16xf16>
-    }
-    "some_use"(%0) : (vector<16x16xf16>) -> ()
-    gpu.return
-  }
-
-// CHECK-LABEL: gpu.func @vector_step_slice
-// CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
-// CHECK:         %[[LANE_ID_IN_SLICED_DIM:.*]] = arith.remui %[[LANE_ID]], %c16 : index
-// CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM1:.*]] = arith.remui %[[LANE_ID_IN_SLICED_DIM]], %c16 : index
-// CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = vector.broadcast %[[LANE_ID_IN_SLICED_DIM1]] : index to vector<1xindex>
-// CHECK-NEXT:    "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
-  gpu.func @vector_step_slice(%arg0: index) {
-    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
-      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
-      gpu.yield %5 : vector<16xindex>
-    }
-    "some_use"(%0) : (vector<1xindex>) -> ()
-    gpu.return
-  }
-
-  // CHECK-LABEL: gpu.func @vector_step_slice_unit
-  // CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
-  // CHECK-NEXT:    %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
-  // CHECK-NEXT:    "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
-  gpu.func @vector_step_slice_unit(%arg0: index) {
-    %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
-      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
-      gpu.yield %5 : vector<1xindex>
-    }
-    "some_use"(%0) : (vector<1xindex>) -> ()
-    gpu.return
-  }
-
-  // CHECK-LABEL: gpu.func @vector_step_slice_multi_dist_unit
-  // CHECK:         (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
-  // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
-  // CHECK-DAG:    %[[DIST_UNIT_SIZE:.*]] = arith.constant 8 : index
-  // CHECK-DAG:    %[[SG_LEVEL_VECSIZE:.*]] = arith.constant 16 : index
-  // CHECK-DAG:    %[[LANE_LAYOUT:.*]] = arith.constant 4 : index
-  // CHECK-DAG:    %[[LANE_DATA:.*]] = arith.constant 2 : index
-  // CHECK-DAG:    %[[LANE_DIST_UNIT_START_IDX:.*]] = arith.divui %[[LANE_ID]], %[[LANE_DATA]] : index
-  // CHECK-DAG:    %[[DIST_UNIT_0_IDX:.*]] = arith.remui %[[LANE_DIST_UNIT_START_IDX]], %[[LANE_LAYOUT]] : index
-  // CHECK-DAG:    %[[DIST_UNIT_0_OFFSET:.*]] = arith.muli %[[DIST_UNIT_0_IDX]], %[[LANE_DATA]] : index
-  // CHECK-DAG:    %[[DIST_UNIT_0_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_0_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
-  // CHECK-DAG:    %[[DIST_UNIT_1_OFFSET:.*]] = arith.addi %[[DIST_UNIT_0_OFFSET]], %[[DIST_UNIT_SIZE]] : index
-  // CHECK-DAG:    %[[DIST_UNIT_1_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_1_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
-  // CHECK-DAG:    %[[V6:.*]] = arith.addi %[[DIST_UNIT_0_SUBRANGE_START]], %[[C1]] : index
-  // CHECK-DAG:    %[[V7:.*]] = arith.addi %[[DIST_UNIT_1_SUBRANGE_START]], %[[C1]] : index
-  // CHECK-DAG:    %[[VEC:.*]] = vector.from_elements
-  // CHECK-SAME:     %[[DIST_UNIT_0_SUBRANGE_START]], %[[V6]],
-  // CHECK-SAME:     %[[DIST_UNIT_1_SUBRANGE_START]], %[[V7]]
-  // CHECK-SAME:     : vector<4xindex>
-  // CHECK-NEXT:    "some_use"(%[[VEC]]) : (vector<4xindex>) -> ()
-  gpu.func @vector_step_slice_multi_dist_unit(%arg0: index) {
-    %0 = gpu.warp_execute_on_lane_0(%arg0)[4] -> (vector<4xindex>) {
-      %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1,2,1]>, dims = [0, 2]>} : vector<16xindex>
-      gpu.yield %5 : vector<16xindex>
-    }
-    "some_use"(%0) : (vector<4xindex>) -> ()
-    gpu.return
-  }
-
-  // CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible(
-  // CHECK: %[[R:.*]] = gpu.warp_execute_on_lane_0
-  // CHECK-NOT: xegpu.convert_layout
-  // CHECK: gpu.yield %{{.*}} : vector<16xf32>
-  gpu.func @convert_layout_removed_when_compatible(%laneid: index){
-    %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>, vector<1xf32>) {
-      %0 = "some_op"() : () -> vector<16xf32>
-      %2 = "some_op"() : () -> vector<1xf32>
-      %1 = xegpu.convert_layout %0
-        <{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
-        target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> 
-        : vector<16xf32>
-      %3 = xegpu.convert_layout %2
-        <{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, 
-        target_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}>
-        : vector<1xf32>
-      %4 = xegpu.convert_layout %3
-        <{input_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>,
-        target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [0, 1]>}> 
-        : vector<1xf32>
-      gpu.yield %1, %4 : vector<16xf32>, vector<1xf32>
-    }
-    "some_user_op"(%r#0, %r#1) : (vector<1xf32>, vector<1xf32>) -> ()
-    gpu.return
-  }
-
-  // CHECK-NOT: xegpu.convert_layout
-  // CHECK: gpu.yield %{{.*}} : f32
-  gpu.func @convert_layout_scalar(%laneid: index){
-    %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (f32) {
-      %0 = "some_op"() : () -> f32
-      %1 = xegpu.convert_layout %0
-        <{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>,
-        target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>}> 
-        : f32
-      gpu.yield %1 : f32
-    }
-    "some_user_op"(%r) : (f32) -> ()
-    gpu.return
-  }
-}

diff  --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
deleted file mode 100644
index 285669cae7174..0000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ /dev/null
@@ -1,439 +0,0 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \
-// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: gpu.func @load_dpas_postop_store
-// CHECK:         (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>,
-// CHECK-SAME:      %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK:         %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK:         %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}]  : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK:         %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK:         %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG:     %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK:         %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-// CHECK:         %[[T6:.*]] = math.exp %[[T5]] : vector<8x1xf32>
-// CHECK-DAG:     %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-DAG:     %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK:         xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @xevm_module{
-  gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
-    %c0 = arith.constant 0 : index
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
-      !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-
-    %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-    %3 = xegpu.load_nd %2[%c0, %c0]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-      -> vector<16x16xf16>
-
-    %4 = xegpu.dpas %1, %3
-      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-
-    %5 = math.exp %4
-      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<8x16xf32>
-
-    %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @gemm
-// CHECK:         (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>,
-// CHECK-SAME:     %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK-DAG:         %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK-DAG:         %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK-DAG:         %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK-DAG:         %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK:             %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT:        %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK-NEXT:        %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK:             %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]])
-// CHECK-SAME:          -> (vector<8x1xf32>) {
-// CHECK-DAG:           %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK-DAG:           %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK-DAG:           %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK-DAG:           %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK-DAG:           %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT:          %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]]
-// CHECK-SAME:            : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT:          %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK-NEXT:          scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK-NEXT:        }
-// CHECK-NEXT:        %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT:        xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @xevm_module{
-gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
-  %c0 = arith.constant 0 : index
-  %c16 = arith.constant 16 : index
-  %c8 = arith.constant 8 : index
-  %c1024 = arith.constant 1024 : index
-  %block_id_x = gpu.block_id x
-  %block_id_y = gpu.block_id y
-  %0 = arith.muli %block_id_x, %c8 : index
-  %1 = arith.muli %block_id_y, %c16 : index
-  %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %3 = xegpu.load_nd %2[%0, %1]
-    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
-
-  %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
-
-    %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
-      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-
-    %7 = xegpu.load_nd %5[%0, %arg3]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
-    %8 = xegpu.load_nd %6[%arg3, %1]
-      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
-
-    %9 = xegpu.dpas %7, %8, %arg4
-      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
-
-    scf.yield %9 : vector<8x16xf32>
-  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-
-  xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_yield
-// CHECK:         (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
-// CHECK-DAG:      %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
-// CHECK-DAG:      %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-DAG:      %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK:          %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
-// CHECK-NEXT:        %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME:          : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT:        %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
-// CHECK-NEXT:        scf.yield %[[LD_CAST]] : vector<1x8xf16>
-// CHECK-NEXT:      } else {
-// CHECK-NEXT:        scf.yield %[[CST]] : vector<1x8xf16>
-// CHECK-NEXT:      }
-// CHECK-NEXT:      %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
-// CHECK-NEXT:      xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME:        vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
-  gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
-    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-    %loaded = scf.if %pred -> (vector<16x8xf16>) {
-      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
-        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
-      } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-      scf.yield %3 : vector<16x8xf16>
-    } else {
-      %3 = arith.constant {
-        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
-      } dense<12.> : vector<16x8xf16>
-      scf.yield %3 : vector<16x8xf16>
-    } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
-    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
-// CHECK:         %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK:         %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK:         %[[PREDICATE:.*]] = llvm.mlir.poison : i1
-// CHECK:         scf.if %[[PREDICATE]] {
-// CHECK-NEXT:      %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME:         memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT:      xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME:         vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-// CHECK-NEXT:    }
-gpu.module @xevm_module{
-  gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
-    %pred = llvm.mlir.poison : i1
-    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
-    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-    scf.if %pred  {
-      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
-        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
-      } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-    }
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @mma_transpose_b(
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK-DAG:     %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-DAG:     %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
-// CHECK-DAG:     %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK-DAG:     %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}>
-// CHECK-SAME:      !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
-// CHECK-NEXT:    %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
-// CHECK-NEXT:    %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
-// CHECK-NEXT:    %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
-// CHECK-NEXT:    %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-gpu.module @xevm_module{
-  gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
-    %c0 = arith.constant 0 : index
-    %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
-      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %1 = xegpu.load_nd %0[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-    %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
-      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
-    %3 = xegpu.load_nd %2[%c0, %c0]  {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
-      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>> -> vector<16x8xi32>
-    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2], order = [0, 1]>}
-      : vector<16x8xi32> to vector<16x16xf16>
-    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
-      : vector<16x16xf16> to vector<16x16xf16>
-    %6 = xegpu.dpas %1, %5
-      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-    %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
-      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
-      !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
-// CHECK:         %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
-// CHECK-SAME:      !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
-// CHECK-SAME:      memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK:           gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
-// CHECK:         }
-// CHECK:         %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
-// CHECK-SAME:      (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK:           %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME:        args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK:             gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
-// CHECK:           }
-// CHECK:           scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
-// CHECK:         }
-gpu.module @xevm_module{
-  gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
-    %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
-    %arg2: memref<16x16xf32>) {
-    %c128 = arith.constant 128 : index
-    %c1 = arith.constant 1 : index
-    %c0 = arith.constant 0 : index
-    %ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<16x1xf32>)
-    %ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : () -> (vector<16x16xf32>)
-    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
-      %1  = "some_def"(%arg5)
-        {
-          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-        }
-        : (vector<16x1xf32>) -> (vector<16x1xf32>)
-      %acc = "some_def"(%arg4, %1)
-        {
-          layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-          layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
-          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-        }
-        : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
-      scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
-    }
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-    xegpu.store_nd %3#0, %arg1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-      : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) {
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: %[[C8:.*]] = arith.constant 8 : index
-// CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C8]]
-// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C8]]
-// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C2]]
-// CHECK: %[[REMU3:.*]] = arith.remui %[[REMU2]], %[[C2]]
-// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C8]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-gpu.module @xevm_module{
-  gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
-    %c0 = arith.constant 0 : index
-    %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
-    xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) {
-// CHECK: %[[C8:.*]] = arith.constant 8 : index
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: %[[C4:.*]] = arith.constant 4 : index
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C4]]
-// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C4]]
-// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C4]]
-// CHECK: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C2]]
-// CHECK: %[[REMU3:.*]] = arith.remui %[[MUL]], %[[C8]]
-// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C4]]
-// CHECK: %[[ADD:.*]] = arith.addi %[[REMU4]], %[[C1]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-gpu.module @xevm_module{
-  gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
-    xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) {
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
-// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
-// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index
-gpu.module @xevm_module{
-  gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
-      !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
-    xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
-      vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
-gpu.module @xevm_module{
-   gpu.func  @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
-    %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
-    %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
-    // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
-    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
-    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) {
-gpu.module @xevm_module{
-   gpu.func  @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
-    %c0 = arith.constant 0 : index
-    %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
-    %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
-
-    %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
-    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
-    // CHECK-NOT: vector.broadcast
-    // CHECK-NOT: vector.shape_cast
-
-    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
-    // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-
-    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) {
-gpu.module @xevm_module{
-   gpu.func  @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
-    %c0 = arith.constant 0 : index
-    %9 = gpu.block_id x
-    %10 = arith.index_cast %9 : index to i16
-    %11 = arith.bitcast %10 : i16 to f16
-    // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16>
-    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
-    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
-      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-    gpu.return
-  }
-}
-
-// -----
-gpu.module @xevm_test {
-    // CHECK-LABEL: gpu.func @vector_reduce_2d
-    // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
-    // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
-    // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
-    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
-    // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
-    // CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32
-    // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-    // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
-    // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
-    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
-    // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
-    // CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction <add>, %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32
-    // CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32
-    // CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32
-    // CHECK: %[[SHUFFLE_1:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_0]], %[[C2]], %[[C16]] : f32
-    // CHECK: %[[VEC_RED_1:.*]] = arith.addf %[[VEC_RED_0]], %[[SHUFFLE_1]] : f32
-    // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_1]], %[[C4]], %[[C16]] : f32
-    // CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32
-    // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
-    // CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
-    // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32>
-    // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
-  gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
-      %cst = arith.constant 1.000000e+00 : f32
-      %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-      %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
-      %2 = vector.broadcast %cst : f32 to vector<16xf32>
-      %3 = vector.multi_reduction <add>, %1, %2 [0] : vector<4x16xf32> to vector<16xf32>
-      %4 = vector.reduction <add>, %3 : vector<16xf32> into f32
-      %40 = xegpu.convert_layout %4 <{input_layout =  #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims = [0]>, target_layout =  #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims = [0]>}>: f32
-      %5 = vector.broadcast %40 : f32 to vector<16xf32>
-      %cst_0 = arith.constant dense<0> : vector<16xindex>
-      %cst_1 = arith.constant dense<true> : vector<16xi1>
-      xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
-    gpu.return
-  }
-}

diff  --git a/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir b/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir
deleted file mode 100644
index c3fdd9c90ffd5..0000000000000
--- a/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-opt --xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
-// Regression test for https://github.com/llvm/llvm-project/issues/181531:
-// Running --xegpu-subgroup-distribute without a chip target attribute used to
-// call llvm_unreachable in getUArch(). The pass should now bail out gracefully.
-
-// CHECK-LABEL: gpu.func @no_crash_without_chip_attr
-// CHECK:       gpu.return
-gpu.module @test_module {
-  gpu.func @no_crash_without_chip_attr(%arg0: memref<8x16xf16>, %arg1: memref<8x16xf16>) {
-    gpu.return
-  }
-}

diff  --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 581072f5218cd..5c3721630837d 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -231,51 +231,20 @@ struct TestXeGPURecoverTemporaryLayouts
   }
 };
 
-struct TestXeGPUSGDistribute
-    : public PassWrapper<TestXeGPUSGDistribute,
-                         OperationPass<gpu::GPUModuleOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute)
-
-  StringRef getArgument() const final { return "test-xegpu-sg-distribute"; }
-
-  StringRef getDescription() const final {
-    return "Test the implementation of XeGPU Subgroup Distribution";
-  }
-
-  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
-    registry.insert<arith::ArithDialect>();
-    registry.insert<memref::MemRefDialect>();
-    registry.insert<xegpu::XeGPUDialect>();
-    registry.insert<vector::VectorDialect>();
-    registry.insert<index::IndexDialect>();
-  }
-
-  TestXeGPUSGDistribute() = default;
-  TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default;
-
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
-  }
-};
-
-/// This test pass is intended to test the subgroup to workitem distribution of
+/// This test pass is intended to test the subgroup to lane distribution of
 /// xegpu/vector/arith operations in isolation, it does not handle any
 /// structural ops like scf.for etc.
-struct TestXeGPUSgToWiDistributeExperimental
-    : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
+struct TestXeGPUSgToLaneDistribute
+    : public PassWrapper<TestXeGPUSgToLaneDistribute,
                          OperationPass<gpu::GPUModuleOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
-      TestXeGPUSgToWiDistributeExperimental)
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSgToLaneDistribute)
 
   StringRef getArgument() const final {
-    return "test-xegpu-sg-to-wi-distribute-experimental";
+    return "test-xegpu-sg-to-lane-distribute";
   }
 
   StringRef getDescription() const final {
-    return "Test the experimental implementation of XeGPU Subgroup to "
-           "Work-item Distribution";
+    return "Test the implementation of XeGPU Subgroup to Lane Distribution";
   }
 
   void getDependentDialects(::mlir::DialectRegistry &registry) const override {
@@ -287,9 +256,8 @@ struct TestXeGPUSgToWiDistributeExperimental
     registry.insert<gpu::GPUDialect>();
   }
 
-  TestXeGPUSgToWiDistributeExperimental() = default;
-  TestXeGPUSgToWiDistributeExperimental(
-      const TestXeGPUSgToWiDistributeExperimental &pass)
+  TestXeGPUSgToLaneDistribute() = default;
+  TestXeGPUSgToLaneDistribute(const TestXeGPUSgToLaneDistribute &pass)
       : PassWrapper(pass) {}
 
   void runOnOperation() override {
@@ -313,42 +281,12 @@ struct TestXeGPUSgToWiDistributeExperimental
 
     ConversionTarget target(*ctx);
     RewritePatternSet patterns(ctx);
-    xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+    xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
         typeConverter, patterns, target);
     (void)applyPartialConversion(op, target, std::move(patterns));
   }
 };
 
-struct TestXeGPUMoveFuncBodyToWarpOp
-    : public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
-                         OperationPass<gpu::GPUModuleOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUMoveFuncBodyToWarpOp)
-
-  StringRef getArgument() const final {
-    return "test-xegpu-move-func-to-warp-op";
-  }
-
-  StringRef getDescription() const final {
-    return "Test the implementation of XeGPU move gpu function body to "
-           "WarpExecuteOnLane0 op.";
-  }
-
-  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
-    registry.insert<xegpu::XeGPUDialect>();
-    registry.insert<gpu::GPUDialect>();
-  }
-
-  TestXeGPUMoveFuncBodyToWarpOp() = default;
-  TestXeGPUMoveFuncBodyToWarpOp(const TestXeGPUMoveFuncBodyToWarpOp &pass) =
-      default;
-
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
-    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
-  }
-};
-
 struct TestXeGPUPropagateLayouts
     : public PassWrapper<TestXeGPUPropagateLayouts,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -516,9 +454,7 @@ void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
   PassRegistration<TestXeGPULayoutInterface>();
   PassRegistration<TestXeGPURecoverTemporaryLayouts>();
-  PassRegistration<TestXeGPUSGDistribute>();
-  PassRegistration<TestXeGPUSgToWiDistributeExperimental>();
-  PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
+  PassRegistration<TestXeGPUSgToLaneDistribute>();
   PassRegistration<TestXeGPUPropagateLayouts>();
   PassRegistration<TestXeGPUResolveLayoutConflicts>();
   PassRegistration<TestXeGPUArrayLengthOptimization>();