[Mlir-commits] [mlir] [MLIR] [XeGPU] Add distribution patterns for vector transpose, bitcast & mask ops in sg to wi pass (PR #187392)

Mon Mar 30 13:54:32 PDT 2026

https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/187392

>From 70490321392a35908560171c98aef136e2edf76f Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 19 Feb 2026 19:30:04 +0000
Subject: [PATCH 1/9] Add distribution pattern for vector.transpose &
 vector.bitcast

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 86 ++++++++++++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 36 ++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..c35daeb1937e3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -489,6 +489,79 @@ struct SgToWiMultiDimReduction
   }
 };
 
+/// Distributes a subgroup-level vector.transpose op to workitem-level.
+/// Only 2D transposes are supported. The result layout must be a transpose of
+/// the source layout. An equivalent vector::TransposeOp is created with
+/// distributed vector types.
+struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
+  using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::TransposeOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr sourceLayout =
+        xegpu::getTemporaryLayout(op->getOpOperand(0));
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!sourceLayout || !resultLayout)
+      return rewriter.notifyMatchFailure(
+          op, "the source or result vector of the transpose op lacks layout "
+              "attribute");
+    int64_t sourceRank = op.getSourceVectorType().getRank();
+    int64_t resultRank = op.getResultVectorType().getRank();
+    // Only 2D transposes are supported.
+    if (sourceRank != 2 || resultRank != 2)
+      return rewriter.notifyMatchFailure(
+          op, "the source or result vector of the transpose op "
+              "does not have 2D layout");
+    ArrayRef<int64_t> perm = op.getPermutation();
+    // Result layout must be a transpose of source layout.
+    if (!resultLayout.isTransposeOf(sourceLayout, perm))
+      return rewriter.notifyMatchFailure(
+          op, "the source or result vector layouts must be 2D transposes of "
+              "each other");
+    FailureOr<VectorType> distributedResultTypeOrFailure =
+        getDistVecTypeBasedOnLaneLayout(resultLayout, op.getResultVectorType());
+    if (failed(distributedResultTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "Failed to distribute the result vector type in "
+              "vector::Transpose op");
+    auto newOp = vector::TransposeOp::create(rewriter, op.getLoc(),
+                                             adaptor.getVector(), perm);
+    rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+                                       distributedResultTypeOrFailure.value()));
+    return success();
+  }
+};
+
+/// Distributes a subgroup-level vector.bitcast op to workitem-level.
+/// Bitcast only impacts the innermost dimension of the source/result vectors.
+/// An equivalent vector::BitCastOp is created with distributed vector types.
+struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
+  using OpConversionPattern<vector::BitCastOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(vector::BitCastOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!resultLayout)
+      return rewriter.notifyMatchFailure(
+          op, "result vector of the bitcast op lacks layout attribute");
+    FailureOr<VectorType> distributedResultTypeOrFailure =
+        getDistVecTypeBasedOnLaneLayout(resultLayout, op.getResultVectorType());
+    if (failed(distributedResultTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "Failed to distribute the result vector type in "
+              "vector::BitCast op");
+    auto newOp = vector::BitCastOp::create(
+        rewriter, op.getLoc(), distributedResultTypeOrFailure.value(),
+        adaptor.getSource());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 /// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
 /// of vector.extract_strided_slice, vector.reduction and
 /// vector.insert_strided_slice ops. This is used when the reduction dimension
@@ -727,10 +800,21 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         // Lane local reductions are illegal at this point and must be lowered.
         return !isReductionLaneLocal(op);
       });
+  // vector::TransposeOp is legal only if it has no result layout attribute.
+  target.addDynamicallyLegalOp<vector::TransposeOp>(
+      [=](vector::TransposeOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
+  // vector::BitCastOp is legal only if it has no result layout attribute.
+  target.addDynamicallyLegalOp<vector::BitCastOp>(
+      [=](vector::BitCastOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
                SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
-               SgToWiVectorReduction, SgToWiMultiDimReduction>(
+               SgToWiVectorReduction, SgToWiMultiDimReduction,
+               SgToWiVectorTranspose, SgToWiVectorBitcast>(
       typeConverter, patterns.getContext());
 }
 
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..73645d2f7b252 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -317,4 +317,40 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
       [1] : vector<16x12xf32> to vector<16xf32>
   gpu.return
 }
+
+// CHECK-LABEL: gpu.func @vector_transpose
+// CHECK:         %[[SRC:.*]] = "some_op"()
+// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
+// CHECK-NEXT:    %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+// CHECK-NEXT:    gpu.return
+gpu.func @vector_transpose() {
+  %cst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : () -> (vector<16x2xf32>)
+  %transpose = vector.transpose %cst, [1, 0]
+    {
+      layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x2xf32> to vector<2x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_bitcast
+// CHECK:         %[[SRC:.*]] = "some_op"()
+// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<4x32xi8> to vector<4x2xi8>
+// CHECK-NEXT:    %[[BC:.*]] = vector.bitcast %[[CAST]] : vector<4x2xi8> to vector<4x1xi16>
+// CHECK-NEXT:    gpu.return
+gpu.func @vector_bitcast() {
+  %cst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+    : () -> (vector<4x32xi8>)
+  %bitcast = vector.bitcast %cst
+    {
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<4x32xi8> to vector<4x16xi16>
+  gpu.return
+}
 }

>From 44aba8df1c3d061a0fc9e2edc5f3919c632be0da Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 17 Mar 2026 22:11:26 +0000
Subject: [PATCH 2/9] Add patterns for CreateMask and ConstantMask

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 96 ++++++++++++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 48 ++++++++++
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 9ca08a3ba53ce..fe5a143b11fa6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
@@ -744,6 +745,85 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
   }
 };
 
+/// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
+/// to workitem-level. Each lane computes its own mask bounds based on its
+/// lane coordinates. For each dimension i, the new mask bound is:
+///   new_bound[i] = original_bound[i] - lane_coord[i] * dist_shape[i]
+/// vector.create_mask implicitly clamps to [0, vector_size].
+/// For constant_mask, the constant dim sizes are first materialized as
+/// Values, then the same logic applies, producing a vector.create_mask.
+template <typename OpType,
+          typename = std::enable_if_t<llvm::is_one_of<
+              OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
+struct SgToWiCreateMask : public OpConversionPattern<OpType> {
+  using OpConversionPattern<OpType>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getTemporaryLayout(op->getOpResult(0));
+    if (!layout || !layout.isForSubgroup())
+      return rewriter.notifyMatchFailure(
+          op, "operation result does not have subgroup distribute layout");
+
+    VectorType origType = op.getType();
+    FailureOr<VectorType> distTypeOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, origType);
+    if (failed(distTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute workitem vector type from the layout");
+
+    VectorType distType = distTypeOrFailure.value();
+    Location loc = op.getLoc();
+
+    // Materialize the original mask operands as Values.
+    SmallVector<Value> origOperands;
+    if constexpr (std::is_same_v<OpType, vector::CreateMaskOp>) {
+      origOperands.append(op.getOperands().begin(), op.getOperands().end());
+    } else {
+      auto dimSizes = op.getMaskDimSizesAttr().asArrayRef();
+      for (auto dimSize : dimSizes)
+        origOperands.push_back(
+            arith::ConstantIndexOp::create(rewriter, loc, dimSize).getResult());
+    }
+
+    ArrayRef<int64_t> origShape = origType.getShape();
+    ArrayRef<int64_t> distShape = distType.getShape();
+
+    // Delinearize lane ID using the layout.
+    Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
+                                         /*upperBound=*/mlir::IntegerAttr());
+    auto maybeIds = layout.delinearizeId(rewriter, loc, laneId);
+    if (failed(maybeIds))
+      return rewriter.notifyMatchFailure(
+          op, "failed to delinearize lane ID from layout");
+    SmallVector<Value> laneIds = maybeIds.value();
+
+    // Compute new mask operands.
+    AffineExpr s0, s1;
+    bindSymbols(rewriter.getContext(), s0, s1);
+    SmallVector<Value> newOperands;
+    for (int i = 0, e = distShape.size(); i < e; ++i) {
+      if (origShape[i] == distShape[i]) {
+        // Dimension is not distributed, keep the original operand.
+        newOperands.push_back(origOperands[i]);
+      } else {
+        // new_bound = original_bound - lane_coord * dist_size
+        Value maskDimIdx = affine::makeComposedAffineApply(
+            rewriter, loc, s1 - s0 * distShape[i],
+            {laneIds[i], origOperands[i]});
+        newOperands.push_back(maskDimIdx);
+      }
+    }
+
+    auto newMask =
+        vector::CreateMaskOp::create(rewriter, loc, distType, newOperands);
+    rewriter.replaceOp(op, newMask.getResult());
+    return success();
+  }
+};
+
 /// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
 struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
   using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
@@ -1120,6 +1200,16 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       [=](vector::MultiDimReductionOp op) -> bool {
         return !isValidSubgroupMultiReductionOp(op);
       });
+  // vector::CreateMaskOp is legal only if it has no result layout attribute.
+  target.addDynamicallyLegalOp<vector::CreateMaskOp>(
+      [=](vector::CreateMaskOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
+  // vector::ConstantMaskOp is legal only if it has no result layout attribute.
+  target.addDynamicallyLegalOp<vector::ConstantMaskOp>(
+      [=](vector::ConstantMaskOp op) -> bool {
+        return !xegpu::getTemporaryLayout(op->getOpResult(0));
+      });
   // vector::TransposeOp is legal only if it has no result layout attribute.
   target.addDynamicallyLegalOp<vector::TransposeOp>(
       [=](vector::TransposeOp op) -> bool {
@@ -1135,6 +1225,8 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
                SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
                SgToWiLoadGather, SgToWiStoreScatter, SgToWiVectorReduction,
                SgToWiMultiDimReduction, SgToWiLoadMatrix, SgToWiStoreMatrix,
-               SgToWiConvertLayout, SgToWiVectorTranspose, SgToWiVectorBitcast>(
-      typeConverter, patterns.getContext());
+               SgToWiConvertLayout, SgToWiVectorTranspose, SgToWiVectorBitcast,
+               SgToWiCreateMask<vector::CreateMaskOp>,
+               SgToWiCreateMask<vector::ConstantMaskOp>>(typeConverter,
+                                                         patterns.getContext());
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 487d2a6dd7dfe..b6f579a6869f2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -497,6 +497,54 @@ gpu.func @vector_bitcast() {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @create_mask_1d
+//  CHECK-SAME: (%[[M0:.*]]: index)
+//       CHECK:   %[[LANE:.*]] = gpu.lane_id
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//       CHECK:   gpu.return
+gpu.func @create_mask_1d(%m0: index) {
+  %mask = vector.create_mask %m0
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_1d
+//       CHECK:   %[[LANE:.*]] = gpu.lane_id
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//       CHECK:   gpu.return
+gpu.func @constant_mask_1d() {
+  %mask = vector.constant_mask [4]
+    {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+    : vector<16xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @create_mask_2d
+//  CHECK-SAME: (%[[M0:.*]]: index, %[[M1:.*]]: index)
+//       CHECK:   %[[LANE:.*]] = gpu.lane_id
+//       CHECK:   vector.create_mask {{.*}} : vector<1x2xi1>
+//       CHECK:   gpu.return
+gpu.func @create_mask_2d(%m0: index, %m1: index) {
+  %mask = vector.create_mask %m0, %m1
+    {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
+    : vector<8x4xi1>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @constant_mask_2d
+//       CHECK:   %[[LANE:.*]] = gpu.lane_id
+//       CHECK:   vector.create_mask {{.*}} : vector<1x2xi1>
+//       CHECK:   gpu.return
+gpu.func @constant_mask_2d() {
+  %mask = vector.constant_mask [2, 3]
+    {layout_result_0 = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>}
+    : vector<8x4xi1>
+  gpu.return
+}
+
 // CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible
 // CHECK-NOT: xegpu.convert_layout
 gpu.func @convert_layout_removed_when_compatible() {

>From 5bbf92bdf37b4b253fa4f0146d93292fa2647f08 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 18 Mar 2026 16:16:06 +0000
Subject: [PATCH 3/9] Clean up

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 21 +++----------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index fe5a143b11fa6..47c29d9d8ead6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -1200,24 +1200,9 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       [=](vector::MultiDimReductionOp op) -> bool {
         return !isValidSubgroupMultiReductionOp(op);
       });
-  // vector::CreateMaskOp is legal only if it has no result layout attribute.
-  target.addDynamicallyLegalOp<vector::CreateMaskOp>(
-      [=](vector::CreateMaskOp op) -> bool {
-        return !xegpu::getTemporaryLayout(op->getOpResult(0));
-      });
-  // vector::ConstantMaskOp is legal only if it has no result layout attribute.
-  target.addDynamicallyLegalOp<vector::ConstantMaskOp>(
-      [=](vector::ConstantMaskOp op) -> bool {
-        return !xegpu::getTemporaryLayout(op->getOpResult(0));
-      });
-  // vector::TransposeOp is legal only if it has no result layout attribute.
-  target.addDynamicallyLegalOp<vector::TransposeOp>(
-      [=](vector::TransposeOp op) -> bool {
-        return !xegpu::getTemporaryLayout(op->getOpResult(0));
-      });
-  // vector::BitCastOp is legal only if it has no result layout attribute.
-  target.addDynamicallyLegalOp<vector::BitCastOp>(
-      [=](vector::BitCastOp op) -> bool {
+  target.addDynamicallyLegalOp<vector::CreateMaskOp, vector::ConstantMaskOp,
+                               vector::TransposeOp, vector::BitCastOp>(
+      [=](Operation *op) -> bool {
         return !xegpu::getTemporaryLayout(op->getOpResult(0));
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });

>From ac0d42c7731953a54a678e4c703e820011a18671 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 23 Mar 2026 22:45:28 +0000
Subject: [PATCH 4/9] Remove operand layouts

---
 mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a290c3c869c72..c876a844e8ae2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -472,7 +472,6 @@ gpu.func @vector_transpose() {
     : () -> (vector<16x2xf32>)
   %transpose = vector.transpose %cst, [1, 0]
     {
-      layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>,
       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
     }
     : vector<16x2xf32> to vector<2x16xf32>
@@ -490,7 +489,6 @@ gpu.func @vector_bitcast() {
     : () -> (vector<4x32xi8>)
   %bitcast = vector.bitcast %cst
     {
-      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
     }
     : vector<4x32xi8> to vector<4x16xi16>

>From d03332150c713f98e6ed4c97e2e1e2aa1be60efb Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Tue, 24 Mar 2026 20:33:30 +0000
Subject: [PATCH 5/9] Feedback

---
 .../Transforms/XeGPUSgToWiDistributeExperimental.cpp   | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 10989b58258f8..62ba8e3500887 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -689,7 +689,6 @@ struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
 };
 
 /// Distributes a subgroup-level vector.transpose op to workitem-level.
-/// Only 2D transposes are supported.
 struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
   using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
 
@@ -704,19 +703,12 @@ struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
       return rewriter.notifyMatchFailure(
           op, "the source or result vector of the transpose op lacks layout "
               "attribute");
-    int64_t sourceRank = op.getSourceVectorType().getRank();
-    int64_t resultRank = op.getResultVectorType().getRank();
-    // Only 2D transposes are supported.
-    if (sourceRank != 2 || resultRank != 2)
-      return rewriter.notifyMatchFailure(
-          op, "the source or result vector of the transpose op "
-              "does not have 2D layout");
     ArrayRef<int64_t> perm = op.getPermutation();
     // Result layout must be a transpose of source layout.
     if (!resultLayout.isTransposeOf(sourceLayout, perm,
                                     xegpu::LayoutKind::Lane))
       return rewriter.notifyMatchFailure(
-          op, "the source or result vector layouts must be 2D transposes of "
+          op, "the source or result vector layouts must be transposes of "
               "each other");
     FailureOr<VectorType> distributedResultTypeOrFailure =
         getDistVecTypeBasedOnLaneLayout(resultLayout, op.getResultVectorType());

>From 478124b9a0f91235e26fa703252a95ff53afc44e Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Fri, 27 Mar 2026 16:27:46 +0000
Subject: [PATCH 6/9] Address Feedback

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 18 +++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 43 +++++++++++++++----
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 62ba8e3500887..c0bc9d1857db8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -754,10 +754,24 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
 /// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
 /// to workitem-level. Each lane computes its own mask bounds based on its
 /// lane coordinates. For each dimension i, the new mask bound is:
-///   new_bound[i] = original_bound[i] - lane_coord[i] * dist_shape[i]
-/// vector.create_mask implicitly clamps to [0, vector_size].
+///   new_bound[i] = original_bound[i] - lane_coord[i] * wi_elem_count[i]
+/// where `wi_elem_count[i]` is the number of elements each workitem holds
+/// along dimension i (i.e., `distType.getShape()[i]`).
+/// `vector.create_mask` implicitly clamps the bounds to
+/// `[0, wi_elem_count[i]]`, so no explicit clamping is needed.
 /// For constant_mask, the constant dim sizes are first materialized as
 /// Values, then the same logic applies, producing a vector.create_mask.
+///
+/// Example:
+///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+///   %mask = vector.create_mask %m0 : vector<16xi1>
+/// For lane k, wi_elem_count = [1], so:
+///   new_bound = m0 - k * 1
+/// Distributed to:
+///   %lane = gpu.lane_id
+///   %new_bound = affine.apply affine_map<()[s0, s1] -> (-s0 + s1)>
+///                  ()[%lane, %m0]
+///   %mask = vector.create_mask %new_bound : vector<1xi1>
 template <typename OpType,
           typename = std::enable_if_t<llvm::is_one_of<
               OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index c876a844e8ae2..07192e32f96aa 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,6 +2,12 @@
 // RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
 // RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
 
+// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (-s0 + s1)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (-s0 + 4)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1] -> (s0 * -2 + s1)>
+// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0] -> (-s0 + 2)>
+// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0] -> (s0 * -2 + 3)>
+
 gpu.module @xevm_module {
 // CHECK-LABEL: gpu.func @create_nd_tdesc
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
@@ -497,8 +503,10 @@ gpu.func @vector_bitcast() {
 
 // CHECK-LABEL: gpu.func @create_mask_1d
 //  CHECK-SAME: (%[[M0:.*]]: index)
-//       CHECK:   %[[LANE:.*]] = gpu.lane_id
-//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP]]()[%[[LANE_ID]], %[[M0]]]
 //       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
 //       CHECK:   gpu.return
 gpu.func @create_mask_1d(%m0: index) {
@@ -509,8 +517,11 @@ gpu.func @create_mask_1d(%m0: index) {
 }
 
 // CHECK-LABEL: gpu.func @constant_mask_1d
-//       CHECK:   %[[LANE:.*]] = gpu.lane_id
-//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply
+//   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
+//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
+//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP1]]()[%[[LANE_ID]]]
 //       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
 //       CHECK:   gpu.return
 gpu.func @constant_mask_1d() {
@@ -522,8 +533,15 @@ gpu.func @constant_mask_1d() {
 
 // CHECK-LABEL: gpu.func @create_mask_2d
 //  CHECK-SAME: (%[[M0:.*]]: index, %[[M1:.*]]: index)
-//       CHECK:   %[[LANE:.*]] = gpu.lane_id
-//       CHECK:   vector.create_mask {{.*}} : vector<1x2xi1>
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
+//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
+//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
+//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
+//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP]]()[%[[REM2]], %[[M0]]]
+//       CHECK:   %[[BOUND1:.*]] = affine.apply #[[$MAP2]]()[%[[REM1]], %[[M1]]]
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[BOUND0]], %[[BOUND1]] : vector<1x2xi1>
 //       CHECK:   gpu.return
 gpu.func @create_mask_2d(%m0: index, %m1: index) {
   %mask = vector.create_mask %m0, %m1
@@ -533,8 +551,17 @@ gpu.func @create_mask_2d(%m0: index, %m1: index) {
 }
 
 // CHECK-LABEL: gpu.func @constant_mask_2d
-//       CHECK:   %[[LANE:.*]] = gpu.lane_id
-//       CHECK:   vector.create_mask {{.*}} : vector<1x2xi1>
+//   CHECK-DAG:   %[[C2_CONST:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
+//   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
+//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
+//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
+//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
+//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP3]]()[%[[REM2]]]
+//       CHECK:   %[[BOUND1:.*]] = affine.apply #[[$MAP4]]()[%[[REM1]]]
+//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[BOUND0]], %[[BOUND1]] : vector<1x2xi1>
 //       CHECK:   gpu.return
 gpu.func @constant_mask_2d() {
   %mask = vector.constant_mask [2, 3]

>From 066dfac73b3ca9756b27841ea7f0605c0c14b9db Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Sat, 28 Mar 2026 16:33:13 +0000
Subject: [PATCH 7/9] Temp commit

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 102 ++++++++++--------
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |  60 +++++------
 2 files changed, 85 insertions(+), 77 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index c0bc9d1857db8..dc6ae8259b11d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
@@ -752,26 +751,28 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
 };
 
 /// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
-/// to workitem-level. Each lane computes its own mask bounds based on its
-/// lane coordinates. For each dimension i, the new mask bound is:
-///   new_bound[i] = original_bound[i] - lane_coord[i] * wi_elem_count[i]
-/// where `wi_elem_count[i]` is the number of elements each workitem holds
-/// along dimension i (i.e., `distType.getShape()[i]`).
-/// `vector.create_mask` implicitly clamps the bounds to
-/// `[0, wi_elem_count[i]]`, so no explicit clamping is needed.
-/// For constant_mask, the constant dim sizes are first materialized as
-/// Values, then the same logic applies, producing a vector.create_mask.
+/// to workitem-level. Uses `computeDistributedCoords()` to obtain the
+/// coordinates each workitem owns, then compares each coordinate against the
+/// original mask bounds using `arith.cmpi slt`. The per-element boolean
+/// results are assembled into the distributed mask vector.
 ///
-/// Example:
+/// For multi-dimensional masks, the element is in-bounds when ALL dimensions
+/// satisfy `coord[i] < bound[i]`.
+///
+/// Example (1D):
 ///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
 ///   %mask = vector.create_mask %m0 : vector<16xi1>
-/// For lane k, wi_elem_count = [1], so:
-///   new_bound = m0 - k * 1
-/// Distributed to:
-///   %lane = gpu.lane_id
-///   %new_bound = affine.apply affine_map<()[s0, s1] -> (-s0 + s1)>
-///                  ()[%lane, %m0]
-///   %mask = vector.create_mask %new_bound : vector<1xi1>
+/// For lane k, computeDistributedCoords gives coord = [k], so:
+///   %in_bounds = arith.cmpi slt, %k, %m0  →  i1
+///   %mask = vector.broadcast %in_bounds : i1 to vector<1xi1>
+///
+/// Example (2D):
+///   layout = #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>
+///   %mask = vector.create_mask %m0, %m1 : vector<8x4xi1>
+/// Each WI owns a 1x2 slice. computeDistributedCoords returns 2 coords:
+///   [[r0, c0], [r0, c1]]
+/// For each coord: in_bounds = (r < m0) && (c < m1)
+///   %mask = vector.from_elements %bit0, %bit1 : vector<1x2xi1>
 template <typename OpType,
           typename = std::enable_if_t<llvm::is_one_of<
               OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
@@ -797,49 +798,58 @@ struct SgToWiCreateMask : public OpConversionPattern<OpType> {
     VectorType distType = distTypeOrFailure.value();
     Location loc = op.getLoc();
 
-    // Materialize the original mask operands as Values.
-    SmallVector<Value> origOperands;
+    // Materialize the original mask bounds as Values.
+    SmallVector<Value> origBounds;
     if constexpr (std::is_same_v<OpType, vector::CreateMaskOp>) {
-      origOperands.append(op.getOperands().begin(), op.getOperands().end());
+      origBounds.append(op.getOperands().begin(), op.getOperands().end());
     } else {
       auto dimSizes = op.getMaskDimSizesAttr().asArrayRef();
       for (auto dimSize : dimSizes)
-        origOperands.push_back(
+        origBounds.push_back(
             arith::ConstantIndexOp::create(rewriter, loc, dimSize).getResult());
     }
 
     ArrayRef<int64_t> origShape = origType.getShape();
-    ArrayRef<int64_t> distShape = distType.getShape();
 
-    // Delinearize lane ID using the layout.
+    // Use computeDistributedCoords to get the coordinates each WI owns.
     Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
                                          /*upperBound=*/mlir::IntegerAttr());
-    auto maybeIds = layout.delinearizeId(rewriter, loc, laneId);
-    if (failed(maybeIds))
+    auto maybeCoordsVec =
+        layout.computeDistributedCoords(rewriter, loc, laneId, origShape);
+    if (failed(maybeCoordsVec))
       return rewriter.notifyMatchFailure(
-          op, "failed to delinearize lane ID from layout");
-    SmallVector<Value> laneIds = maybeIds.value();
-
-    // Compute new mask operands.
-    AffineExpr s0, s1;
-    bindSymbols(rewriter.getContext(), s0, s1);
-    SmallVector<Value> newOperands;
-    for (int i = 0, e = distShape.size(); i < e; ++i) {
-      if (origShape[i] == distShape[i]) {
-        // Dimension is not distributed, keep the original operand.
-        newOperands.push_back(origOperands[i]);
-      } else {
-        // new_bound = original_bound - lane_coord * dist_size
-        Value maskDimIdx = affine::makeComposedAffineApply(
-            rewriter, loc, s1 - s0 * distShape[i],
-            {laneIds[i], origOperands[i]});
-        newOperands.push_back(maskDimIdx);
+          op, "failed to compute distributed coordinates from layout");
+
+    SmallVector<SmallVector<Value>> coordsVec = maybeCoordsVec.value();
+    int64_t numElements = distType.getNumElements();
+    assert(static_cast<int64_t>(coordsVec.size()) == numElements &&
+           "number of coordinate sets must match number of distributed "
+           "elements");
+
+    // For each element, compare all coordinates against bounds.
+    Value trueVal =
+        arith::ConstantIntOp::create(rewriter, loc, /*value=*/1, /*width=*/1);
+    SmallVector<Value> maskBits;
+    for (auto &coords : coordsVec) {
+      Value inBounds = trueVal;
+      for (size_t i = 0; i < coords.size(); ++i) {
+        Value cmp = arith::CmpIOp::create(
+            rewriter, loc, arith::CmpIPredicate::slt, coords[i], origBounds[i]);
+        inBounds = arith::AndIOp::create(rewriter, loc, inBounds, cmp);
       }
+      maskBits.push_back(inBounds);
     }
 
-    auto newMask =
-        vector::CreateMaskOp::create(rewriter, loc, distType, newOperands);
-    rewriter.replaceOp(op, newMask.getResult());
+    // Build the distributed mask vector.
+    Value result;
+    if (numElements == 1) {
+      result =
+          vector::BroadcastOp::create(rewriter, loc, distType, maskBits[0]);
+    } else {
+      result =
+          vector::FromElementsOp::create(rewriter, loc, distType, maskBits);
+    }
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 07192e32f96aa..09e889cd63ea3 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,12 +2,6 @@
 // RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
 // RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
 
-// CHECK-DAG: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (-s0 + s1)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (-s0 + 4)>
-// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1] -> (s0 * -2 + s1)>
-// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0] -> (-s0 + 2)>
-// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0] -> (s0 * -2 + 3)>
-
 gpu.module @xevm_module {
 // CHECK-LABEL: gpu.func @create_nd_tdesc
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
@@ -504,10 +498,10 @@ gpu.func @vector_bitcast() {
 // CHECK-LABEL: gpu.func @create_mask_1d
 //  CHECK-SAME: (%[[M0:.*]]: index)
 //   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
-//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
-//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
-//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP]]()[%[[LANE_ID]], %[[M0]]]
-//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//   CHECK-DAG:   %[[TRUE:.*]] = arith.constant true
+//       CHECK:   %[[CMP:.*]] = arith.cmpi slt, %{{.*}}, %[[M0]] : index
+//       CHECK:   %[[AND:.*]] = arith.andi %[[TRUE]], %[[CMP]] : i1
+//       CHECK:   %[[MASK:.*]] = vector.broadcast %[[AND]] : i1 to vector<1xi1>
 //       CHECK:   gpu.return
 gpu.func @create_mask_1d(%m0: index) {
   %mask = vector.create_mask %m0
@@ -519,10 +513,10 @@ gpu.func @create_mask_1d(%m0: index) {
 // CHECK-LABEL: gpu.func @constant_mask_1d
 //   CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
 //   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
-//   CHECK-DAG:   %[[C16:.*]] = arith.constant 16 : index
-//       CHECK:   %[[LANE_ID:.*]] = arith.remui %[[LANE]], %[[C16]] : index
-//       CHECK:   %[[NEW_BOUND:.*]] = affine.apply #[[$MAP1]]()[%[[LANE_ID]]]
-//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[NEW_BOUND]] : vector<1xi1>
+//   CHECK-DAG:   %[[TRUE:.*]] = arith.constant true
+//       CHECK:   %[[CMP:.*]] = arith.cmpi slt, %{{.*}}, %[[C4]] : index
+//       CHECK:   %[[AND:.*]] = arith.andi %[[TRUE]], %[[CMP]] : i1
+//       CHECK:   %[[MASK:.*]] = vector.broadcast %[[AND]] : i1 to vector<1xi1>
 //       CHECK:   gpu.return
 gpu.func @constant_mask_1d() {
   %mask = vector.constant_mask [4]
@@ -534,14 +528,16 @@ gpu.func @constant_mask_1d() {
 // CHECK-LABEL: gpu.func @create_mask_2d
 //  CHECK-SAME: (%[[M0:.*]]: index, %[[M1:.*]]: index)
 //   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
-//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
-//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
-//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
-//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
-//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP]]()[%[[REM2]], %[[M0]]]
-//       CHECK:   %[[BOUND1:.*]] = affine.apply #[[$MAP2]]()[%[[REM1]], %[[M1]]]
-//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[BOUND0]], %[[BOUND1]] : vector<1x2xi1>
+//   CHECK-DAG:   %[[TRUE:.*]] = arith.constant true
+//       CHECK:   %[[CMP_R0:.*]] = arith.cmpi slt, %{{.*}}, %[[M0]] : index
+//       CHECK:   %[[AND0:.*]] = arith.andi %[[TRUE]], %[[CMP_R0]] : i1
+//       CHECK:   %[[CMP_C0:.*]] = arith.cmpi slt, %{{.*}}, %[[M1]] : index
+//       CHECK:   %[[BIT0:.*]] = arith.andi %[[AND0]], %[[CMP_C0]] : i1
+//       CHECK:   %[[CMP_R1:.*]] = arith.cmpi slt, %{{.*}}, %[[M0]] : index
+//       CHECK:   %[[AND1:.*]] = arith.andi %[[TRUE]], %[[CMP_R1]] : i1
+//       CHECK:   %[[CMP_C1:.*]] = arith.cmpi slt, %{{.*}}, %[[M1]] : index
+//       CHECK:   %[[BIT1:.*]] = arith.andi %[[AND1]], %[[CMP_C1]] : i1
+//       CHECK:   %[[MASK:.*]] = vector.from_elements %[[BIT0]], %[[BIT1]] : vector<1x2xi1>
 //       CHECK:   gpu.return
 gpu.func @create_mask_2d(%m0: index, %m1: index) {
   %mask = vector.create_mask %m0, %m1
@@ -551,17 +547,19 @@ gpu.func @create_mask_2d(%m0: index, %m1: index) {
 }
 
 // CHECK-LABEL: gpu.func @constant_mask_2d
-//   CHECK-DAG:   %[[C2_CONST:.*]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
 //   CHECK-DAG:   %[[C3:.*]] = arith.constant 3 : index
 //   CHECK-DAG:   %[[LANE:.*]] = gpu.lane_id
-//   CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK:   %[[REM1:.*]] = arith.remui %[[LANE]], %[[C2]] : index
-//       CHECK:   %[[DIV:.*]] = arith.divui %[[LANE]], %[[C2]] : index
-//   CHECK-DAG:   %[[C8:.*]] = arith.constant 8 : index
-//       CHECK:   %[[REM2:.*]] = arith.remui %[[DIV]], %[[C8]] : index
-//       CHECK:   %[[BOUND0:.*]] = affine.apply #[[$MAP3]]()[%[[REM2]]]
-//       CHECK:   %[[BOUND1:.*]] = affine.apply #[[$MAP4]]()[%[[REM1]]]
-//       CHECK:   %[[MASK:.*]] = vector.create_mask %[[BOUND0]], %[[BOUND1]] : vector<1x2xi1>
+//   CHECK-DAG:   %[[TRUE:.*]] = arith.constant true
+//       CHECK:   %[[CMP_R0:.*]] = arith.cmpi slt, %{{.*}}, %[[C2]] : index
+//       CHECK:   %[[AND0:.*]] = arith.andi %[[TRUE]], %[[CMP_R0]] : i1
+//       CHECK:   %[[CMP_C0:.*]] = arith.cmpi slt, %{{.*}}, %[[C3]] : index
+//       CHECK:   %[[BIT0:.*]] = arith.andi %[[AND0]], %[[CMP_C0]] : i1
+//       CHECK:   %[[CMP_R1:.*]] = arith.cmpi slt, %{{.*}}, %[[C2]] : index
+//       CHECK:   %[[AND1:.*]] = arith.andi %[[TRUE]], %[[CMP_R1]] : i1
+//       CHECK:   %[[CMP_C1:.*]] = arith.cmpi slt, %{{.*}}, %[[C3]] : index
+//       CHECK:   %[[BIT1:.*]] = arith.andi %[[AND1]], %[[CMP_C1]] : i1
+//       CHECK:   %[[MASK:.*]] = vector.from_elements %[[BIT0]], %[[BIT1]] : vector<1x2xi1>
 //       CHECK:   gpu.return
 gpu.func @constant_mask_2d() {
   %mask = vector.constant_mask [2, 3]

>From 1cf3cb9162957b6047653375356304db2ff727fa Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Sun, 29 Mar 2026 21:40:55 +0000
Subject: [PATCH 8/9] Clean up

---
 .../Transforms/XeGPUSgToWiDistributeExperimental.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index fbecf00da4324..b17e1b46a4e18 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -1701,14 +1701,10 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       });
   target.addDynamicallyLegalOp<vector::CreateMaskOp, vector::ConstantMaskOp,
                                vector::TransposeOp, vector::BitCastOp,
-                               vector::ShapeCastOp, vector::StepOp>(
-      [=](Operation *op) -> bool {
-        return !xegpu::getTemporaryLayout(op->getOpResult(0));
-      });
-  target.addDynamicallyLegalOp<vector::BroadcastOp>(
-      [=](vector::BroadcastOp op) -> bool {
-        return !xegpu::getTemporaryLayout(op->getResult(0));
-      });
+                               vector::ShapeCastOp, vector::StepOp,
+                               vector::BroadcastOp>([=](Operation *op) -> bool {
+    return !xegpu::getTemporaryLayout(op->getOpResult(0));
+  });
   target.addDynamicallyLegalOp<vector::ExtractOp>(
       [=](vector::ExtractOp op) -> bool {
         if (!isa<VectorType>(op.getType()))

>From 0c5b7fe20675eada3d390df5b94256c270b0f9fb Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 30 Mar 2026 20:44:31 +0000
Subject: [PATCH 9/9] Fix comment

---
 .../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b17e1b46a4e18..bb0467cdeb89b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -763,7 +763,7 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
 ///   layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
 ///   %mask = vector.create_mask %m0 : vector<16xi1>
 /// For lane k, computeDistributedCoords gives coord = [k], so:
-///   %in_bounds = arith.cmpi slt, %k, %m0  →  i1
+///   %in_bounds = arith.cmpi slt, %coord, %m0  →  i1
 ///   %mask = vector.broadcast %in_bounds : i1 to vector<1xi1>
 ///
 /// Example (2D):