[Mlir-commits] [mlir] [mlir][xegpu] Add `XeGPUSgToWiDistributeExperimental` pass. (PR #177492)

Thu Jan 29 09:45:56 PST 2026

https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/177492

>From a258368e1ece17eb3d378aa356feef1530dccc01 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 12 Jan 2026 22:54:17 +0000
Subject: [PATCH 01/18] save work

---
 mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3ff7805263f0e..4a213cad69e68 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -114,4 +114,14 @@ def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
                            "vector::VectorDialect"];
 }
 
+def XeGPUSgToWiDistributeExperimental : Pass<"xegpu-sg-to-wi-distribute-experimental"> {
+  let summary = "Distribute XeGPU ops to work items";
+  let description = [{
+    The pass distributes subgroup level XeGPU ops to work item level XeGPU ops.
+  }];
+  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+                           "vector::VectorDialect", "index::IndexDialect"];
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD

>From 1c0d3d6180ef14ce4e62cffdf28e80f5984cb481 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 14 Jan 2026 17:33:38 +0000
Subject: [PATCH 02/18] save work

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |  4 ++
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |  1 +
 .../XeGPUSgToWiDistributeExperimental.cpp     | 59 +++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1776a209d0bf1..8f69b9e75f374 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -70,6 +70,10 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU workgroup to subgroup distribution into
 /// `patterns`.
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU subgroup to work-item distribution into
+/// `patterns`.
+void populateXeGPUSgToWiDistributeExperimentalPatterns(
+    RewritePatternSet &patterns);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
 /// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 29b645feab2c6..9bdde30ca5b89 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUBlocking.cpp
   XeGPUFoldAliasOps.cpp
+  XeGPUSgToWiDistributeExperimental.cpp
   XeGPUSubgroupDistribute.cpp
   XeGPUUnroll.cpp
   XeGPUWgToSgDistribute.cpp
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
new file mode 100644
index 0000000000000..d1ac34c69d4b5
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -0,0 +1,59 @@
+//===- XeGPUSgToWiDistributeExperimental.cpp - XeGPU SG to WI Pass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUSGTOWIDISTRIBUTEEXPERIMENTAL
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+
+struct XeGPUSgToWiDistributeExperimentalPass
+    : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
+          XeGPUSgToWiDistributeExperimentalPass> {
+  void runOnOperation() override;
+};
+
+} // namespace
+
+void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+  // Recover layouts.
+  Operation *op = getOperation();
+  if (!xegpu::recoverTemporaryLayouts(op)) {
+    signalPassFailure();
+    return;
+  }
+
+  // Define conversion target
+  ConversionTarget target(getContext());
+  target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
+                         vector::VectorDialect>();
+  target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
+      [](Operation *op) { return true; });
+
+  // Define type converter
+  TypeConverter typeConverter;
+  typeConverter.addConversion([](Type type) { return type; });
+}
+
+void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
+    RewritePatternSet &patterns) {
+  // TODO: Implement pattern population logic
+}

>From 9fb97fe24a11c92415a691d6c4a1fbeec0ccc8e5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 14 Jan 2026 22:21:08 +0000
Subject: [PATCH 03/18] save work

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |  3 +-
 .../XeGPUSgToWiDistributeExperimental.cpp     | 27 +++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 20 +++++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 54 +++++++++++++++++++
 4 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index e4c8e2356b191..898af0ec14738 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H
 
 #include "mlir/IR/Operation.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/LogicalResult.h"
 
@@ -73,7 +74,7 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU subgroup to work-item distribution into
 /// `patterns`.
 void populateXeGPUSgToWiDistributeExperimentalPatterns(
-    RewritePatternSet &patterns);
+    RewritePatternSet &patterns, TypeConverter &typeConverter);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
 /// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index d1ac34c69d4b5..2f73b08aa45ac 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -13,6 +13,8 @@
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/LogicalResult.h"
 
 namespace mlir {
 namespace xegpu {
@@ -22,9 +24,30 @@ namespace xegpu {
 } // namespace mlir
 
 using namespace mlir;
+using namespace mlir::xegpu;
 
 namespace {
 
+struct CreateNdDescOpPattern
+    : public OpConversionPattern<xegpu::CreateNdDescOp> {
+  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto resultType = op.getType();
+    // If no layout, nothing to do.
+    if (!resultType.getLayout())
+      return failure();
+
+    auto newOp = xegpu::CreateNdDescOp::create(
+        rewriter, op.getLoc(), resultType.dropLayouts(), op->getOperands(),
+        op->getAttrs());
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -54,6 +77,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 }
 
 void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
-    RewritePatternSet &patterns) {
-  // TODO: Implement pattern population logic
+    RewritePatternSet &patterns, TypeConverter &typeConverter) {
+  patterns.add<CreateNdDescOpPattern>(typeConverter, patterns.getContext());
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
new file mode 100644
index 0000000000000..2d6e7015e3b39
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -0,0 +1,20 @@
+
+// RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' \
+// --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+
+gpu.module @xevm_module {
+gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16>
+    -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
+gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape : [256, 256], strides : [256, 1] : ui64
+    -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
+}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 1a1520dfa975d..fcae5a3a5dd06 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -247,6 +247,59 @@ struct TestXeGPUSGDistribute
   }
 };
 
+struct TestXeGPUSgToWiDistributeExperimental
+    : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
+                         OperationPass<gpu::GPUModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+      TestXeGPUSgToWiDistributeExperimental)
+
+  StringRef getArgument() const final {
+    return "test-xegpu-sg-to-wi-distribute-experimental";
+  }
+
+  StringRef getDescription() const final {
+    return "Test the experimental implementation of XeGPU Subgroup to "
+           "Work-item Distribution";
+  }
+
+  void getDependentDialects(::mlir::DialectRegistry &registry) const override {
+    registry.insert<arith::ArithDialect>();
+    registry.insert<memref::MemRefDialect>();
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<vector::VectorDialect>();
+    registry.insert<index::IndexDialect>();
+    registry.insert<gpu::GPUDialect>();
+  }
+
+  TestXeGPUSgToWiDistributeExperimental() = default;
+  TestXeGPUSgToWiDistributeExperimental(
+      const TestXeGPUSgToWiDistributeExperimental &pass) = default;
+
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+
+    TypeConverter typeConverter;
+    // After distribution, there are no layouts associated with the tensor_desc
+    // types.
+    typeConverter.addConversion(
+        [](xegpu::TensorDescType type) { return type.dropLayouts(); });
+    typeConverter.addConversion([](Type type) { return type; });
+
+    ConversionTarget target(*ctx);
+    // CreateNdDescOp is legal only if its result type has no layout attribute.
+    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+        [&](xegpu::CreateNdDescOp op) {
+          return !op.getType().getLayoutAttr();
+        });
+    RewritePatternSet patterns(ctx);
+    xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
+                                                             typeConverter);
+    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
+
+    (void)applyPartialConversion(getOperation(), target, std::move(patterns));
+  }
+};
+
 struct TestXeGPUMoveFuncBodyToWarpOp
     : public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -341,6 +394,7 @@ void registerTestXeGPULowerings() {
   PassRegistration<TestXeGPUUnrollingPatterns>();
   PassRegistration<TestXeGPULayoutInterface>();
   PassRegistration<TestXeGPUSGDistribute>();
+  PassRegistration<TestXeGPUSgToWiDistributeExperimental>();
   PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
 }
 } // namespace test

>From 2f818376693291202f89a3e4ae36042c9d3173bb Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 15 Jan 2026 21:12:55 +0000
Subject: [PATCH 04/18] save work

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 69 ++++++++++++++-----
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 12 +++-
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 53 ++++++++++++--
 3 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 2f73b08aa45ac..25df0f341093b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -24,7 +24,6 @@ namespace xegpu {
 } // namespace mlir
 
 using namespace mlir;
-using namespace mlir::xegpu;
 
 namespace {
 
@@ -35,19 +34,50 @@ struct CreateNdDescOpPattern
   LogicalResult
   matchAndRewrite(xegpu::CreateNdDescOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto resultType = op.getType();
+    xegpu::TensorDescType resultType = op.getType();
     // If no layout, nothing to do.
     if (!resultType.getLayout())
       return failure();
 
     auto newOp = xegpu::CreateNdDescOp::create(
-        rewriter, op.getLoc(), resultType.dropLayouts(), op->getOperands(),
+        rewriter, op.getLoc(), resultType.dropLayouts(), op.getOperands(),
         op->getAttrs());
     rewriter.replaceOp(op, newOp.getResult());
     return success();
   }
 };
 
+struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
+  using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(xegpu::LoadNdOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+    // If no layout, nothing to do.
+    if (!layout)
+      return failure();
+    // Check if the layout attached to the tensor descriptor is same as the
+    // anchor layout. Otherwise, this is a conflict.
+    if (op.getTensorDescType().getLayout() != layout)
+      return rewriter.notifyMatchFailure(
+          op, "conflicting layout attributes on tensor descriptor and anchor");
+    auto distributedVectorTypeOrFailure =
+        xegpu::getDistributedVectorType(op.getTensorDescType());
+    if (failed(distributedVectorTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute distributed vector type from the layout");
+    llvm::errs() << "adaptor tensor desc: " << adaptor.getTensorDesc() << "\n";
+    auto newOp = xegpu::LoadNdOp::create(
+        rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
+        adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
+        op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
+        op.getL3HintAttr(), /**layout**/ nullptr);
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -57,26 +87,27 @@ struct XeGPUSgToWiDistributeExperimentalPass
 } // namespace
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
-  // Recover layouts.
-  Operation *op = getOperation();
-  if (!xegpu::recoverTemporaryLayouts(op)) {
-    signalPassFailure();
-    return;
-  }
+  // // Recover layouts.
+  // Operation *op = getOperation();
+  // if (!xegpu::recoverTemporaryLayouts(op)) {
+  //   signalPassFailure();
+  //   return;
+  // }
 
-  // Define conversion target
-  ConversionTarget target(getContext());
-  target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
-                         vector::VectorDialect>();
-  target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
-      [](Operation *op) { return true; });
+  // // Define conversion target
+  // ConversionTarget target(getContext());
+  // target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
+  //                        vector::VectorDialect>();
+  // target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
+  //     [](Operation *op) { return true; });
 
-  // Define type converter
-  TypeConverter typeConverter;
-  typeConverter.addConversion([](Type type) { return type; });
+  // // Define type converter
+  // TypeConverter typeConverter;
+  // typeConverter.addConversion([](Type type) { return type; });
 }
 
 void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
     RewritePatternSet &patterns, TypeConverter &typeConverter) {
-  patterns.add<CreateNdDescOpPattern>(typeConverter, patterns.getContext());
+  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern>(typeConverter,
+                                                       patterns.getContext());
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 2d6e7015e3b39..df21e4a05bfca 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -5,16 +5,24 @@
 gpu.module @xevm_module {
 gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
   %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16>
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
     -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
 gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
   %c0 = arith.constant 0 : index
-  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape : [256, 256], strides : [256, 1] : ui64
+  %0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
     -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
 
+gpu.func @load_nd() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+  gpu.return
+}
+
 }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index fcae5a3a5dd06..093e37153cd74 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -12,10 +12,13 @@
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include <optional>
 
 using namespace mlir;
 using namespace mlir::xegpu;
@@ -279,11 +282,45 @@ struct TestXeGPUSgToWiDistributeExperimental
     MLIRContext *ctx = &getContext();
 
     TypeConverter typeConverter;
-    // After distribution, there are no layouts associated with the tensor_desc
-    // types.
-    typeConverter.addConversion(
-        [](xegpu::TensorDescType type) { return type.dropLayouts(); });
     typeConverter.addConversion([](Type type) { return type; });
+    typeConverter.addConversion([](TensorDescType type) -> Type {
+      if (type.getLayoutAttr()) {
+        return type.dropLayouts();
+      }
+      return type;
+    });
+    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                               mlir::ValueRange inputs,
+                               mlir::Location loc) -> mlir::Value {
+      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+          .getResult(0);
+    };
+    // Define a vector materialization cast. If the input and output have same
+    // number of elements, perform a shape cast. Otherwise, use
+    // UnrealizedConversionCastOp to handle the conversion.
+    auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
+                                        ValueRange inputs,
+                                        Location loc) -> Value {
+      if (inputs.size() != 1)
+        return {};
+      auto input = inputs.front();
+      auto inputVecTy = dyn_cast<VectorType>(input.getType());
+      auto targetVecTy = dyn_cast<VectorType>(type);
+      if (inputVecTy && targetVecTy) {
+        if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
+          return vector::ShapeCastOp::create(builder, loc, targetVecTy, input)
+              .getResult();
+        }
+        return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+            .getResult(0);
+      }
+
+      return {};
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+    typeConverter.addSourceMaterialization(vectorMaterializationCast);
+    typeConverter.addTargetMaterialization(vectorMaterializationCast);
 
     ConversionTarget target(*ctx);
     // CreateNdDescOp is legal only if its result type has no layout attribute.
@@ -291,6 +328,14 @@ struct TestXeGPUSgToWiDistributeExperimental
         [&](xegpu::CreateNdDescOp op) {
           return !op.getType().getLayoutAttr();
         });
+    // Any anchor XeGPU op is legal only if it has no anchor layout.
+    target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
+      auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
+      if (!anchorOp)
+        return true;
+      return !anchorOp.getAnchorLayout();
+    });
+
     RewritePatternSet patterns(ctx);
     xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
                                                              typeConverter);

>From 8784a2628f417fac71f19386ad6f2926055caa99 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 15 Jan 2026 23:23:40 +0000
Subject: [PATCH 05/18] save work

---
 .../XeGPUSgToWiDistributeExperimental.cpp     |  39 ++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |  11 ++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 104 ++++++++++++++----
 3 files changed, 127 insertions(+), 27 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 25df0f341093b..31e8b070b0e8b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
 namespace xegpu {
@@ -67,7 +68,6 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
     if (failed(distributedVectorTypeOrFailure))
       return rewriter.notifyMatchFailure(
           op, "unable to compute distributed vector type from the layout");
-    llvm::errs() << "adaptor tensor desc: " << adaptor.getTensorDesc() << "\n";
     auto newOp = xegpu::LoadNdOp::create(
         rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
@@ -78,6 +78,39 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
   }
 };
 
+struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
+  using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(xegpu::StoreNdOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+    // If no layout, nothing to do.
+    if (!layout)
+      return failure();
+    // Check if the layout attached to the tensor descriptor and value layout is
+    // same as the anchor layout. Otherwise, this is a conflict.
+    if (op.getTensorDescType().getLayout() != layout)
+      return rewriter.notifyMatchFailure(
+          op, "conflicting layout attributes on tensor descriptor and anchor");
+    auto valueLayout = xegpu::getDistributeLayoutAttr(op->getOpOperand(0));
+    if (valueLayout != layout)
+      return rewriter.notifyMatchFailure(
+          op, "conflicting layout attributes on value and anchor");
+    auto distributedVectorTypeOrFailure =
+        xegpu::getDistributedVectorType(op.getTensorDescType());
+    if (failed(distributedVectorTypeOrFailure))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute distributed vector type from the layout");
+    xegpu::StoreNdOp::create(rewriter, op.getLoc(), adaptor.getValue(),
+                             adaptor.getTensorDesc(), op.getMixedOffsets(),
+                             op.getL1HintAttr(), op.getL2HintAttr(),
+                             op.getL3HintAttr(), /**layout**/ nullptr);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -108,6 +141,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
 void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
     RewritePatternSet &patterns, TypeConverter &typeConverter) {
-  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern>(typeConverter,
-                                                       patterns.getContext());
+  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern>(
+      typeConverter, patterns.getContext());
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index df21e4a05bfca..a8a654f3b4759 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -25,4 +25,15 @@ gpu.func @load_nd() {
   gpu.return
 }
 
+gpu.func @store_nd() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+  xegpu.store_nd %2, %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
 }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 093e37153cd74..6a6b5f6c6a856 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -250,6 +250,35 @@ struct TestXeGPUSGDistribute
   }
 };
 
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
+                                VectorType originalType) {
+  if (!layout)
+    return failure();
+  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
+         "Expecting a valid layout.");
+  SmallVector<int64_t> effectiveLaneLayout =
+      layout.getEffectiveLaneLayoutAsInt();
+  assert(static_cast<size_t>(originalType.getRank()) >=
+             effectiveLaneLayout.size() &&
+         "Rank of the original vector type should be greater or equal to the "
+         "size of the lane layout to distribute the vector type.");
+  SmallVector<int64_t> distributedShape(originalType.getShape());
+  // Only distribute the last `laneLayout.size()` dimensions. The remaining
+  // dimensions are not distributed.
+  unsigned distributionStart =
+      originalType.getRank() - effectiveLaneLayout.size();
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i < distributionStart)
+      continue;
+    // Check if the dimension can be distributed evenly.
+    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
+  }
+  return VectorType::get(distributedShape, originalType.getElementType());
+}
+
 struct TestXeGPUSgToWiDistributeExperimental
     : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -289,38 +318,65 @@ struct TestXeGPUSgToWiDistributeExperimental
       }
       return type;
     });
+    typeConverter.addConversion([](Value v) -> std::optional<Type> {
+      auto type = v.getType();
+      auto layout = xegpu::getDistributeLayoutAttr(v);
+      // If no valid layout, nothing to do.
+      if (!layout || !layout.isForSubgroup())
+        return std::nullopt;
+      Operation *op = v.getDefiningOp();
+      if (isa<LoadNdOp>(op)) {
+        auto loadNdOp = cast<LoadNdOp>(op);
+        layout = loadNdOp.getAnchorLayout();
+        auto newTyOrFailure =
+            getDistributedVectorType(loadNdOp.getTensorDescType());
+        if (succeeded(newTyOrFailure))
+          return *newTyOrFailure;
+        return std::nullopt;
+      }
+      // For other vector types, distribute based on the lane layout.
+      if (isa<VectorType>(type)) {
+        auto newTyOrFailure =
+            getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+        if (succeeded(newTyOrFailure))
+          return *newTyOrFailure;
+      }
+      return std::nullopt;
+    });
     auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
                                mlir::ValueRange inputs,
                                mlir::Location loc) -> mlir::Value {
       return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
           .getResult(0);
     };
-    // Define a vector materialization cast. If the input and output have same
-    // number of elements, perform a shape cast. Otherwise, use
-    // UnrealizedConversionCastOp to handle the conversion.
-    auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
-                                        ValueRange inputs,
-                                        Location loc) -> Value {
-      if (inputs.size() != 1)
-        return {};
-      auto input = inputs.front();
-      auto inputVecTy = dyn_cast<VectorType>(input.getType());
-      auto targetVecTy = dyn_cast<VectorType>(type);
-      if (inputVecTy && targetVecTy) {
-        if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
-          return vector::ShapeCastOp::create(builder, loc, targetVecTy, input)
-              .getResult();
-        }
-        return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
-            .getResult(0);
-      }
-
-      return {};
-    };
+    // // Define a vector materialization cast. If the input and output have
+    // same
+    // // number of elements, perform a shape cast. Otherwise, use
+    // // UnrealizedConversionCastOp to handle the conversion.
+    // auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
+    //                                     ValueRange inputs,
+    //                                     Location loc) -> Value {
+    //   if (inputs.size() != 1)
+    //     return {};
+    //   auto input = inputs.front();
+    //   auto inputVecTy = dyn_cast<VectorType>(input.getType());
+    //   auto targetVecTy = dyn_cast<VectorType>(type);
+    //   if (inputVecTy && targetVecTy) {
+    //     if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
+    //       return vector::ShapeCastOp::create(builder, loc, targetVecTy,
+    //       input)
+    //           .getResult();
+    //     }
+    //     return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+    //         .getResult(0);
+    //   }
+
+    //   return {};
+    // };
     typeConverter.addSourceMaterialization(materializeCast);
     typeConverter.addTargetMaterialization(materializeCast);
-    typeConverter.addSourceMaterialization(vectorMaterializationCast);
-    typeConverter.addTargetMaterialization(vectorMaterializationCast);
+    // typeConverter.addSourceMaterialization(vectorMaterializationCast);
+    // typeConverter.addTargetMaterialization(vectorMaterializationCast);
 
     ConversionTarget target(*ctx);
     // CreateNdDescOp is legal only if its result type has no layout attribute.

>From 47e66a487fa3daa380ca97440402dcb5bcfa27e6 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 16 Jan 2026 23:41:13 +0000
Subject: [PATCH 06/18] save work

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |  17 ++
 .../XeGPUSgToWiDistributeExperimental.cpp     | 185 ++++++++++++++++--
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  44 +----
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  29 +++
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |  30 +++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 122 ++++++------
 6 files changed, 304 insertions(+), 123 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 6573343a8bc97..d327a431d6ec4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -63,6 +63,23 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
 FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
                                                LayoutAttr layout);
 
+/// Helper function to get the distributed vector type for a source vector
+/// type according to the lane_layout. Each trailing vector dimension is
+/// divided by the matching effective lane_layout dimension; leading
+/// dimensions are kept. Fails if `layout` is null or a division is uneven.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(DistributeLayoutAttr layout,
+                                VectorType originalType);
+
 /// Extract a set of small vectors from a value with a given shape using
 /// vector.extract_stride_slice
 SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 31e8b070b0e8b..ed39a1ee39918 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -12,6 +12,9 @@
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/LogicalResult.h"
@@ -28,6 +31,17 @@ using namespace mlir;
 
 namespace {
 
+static Value resolveTy(ConversionPatternRewriter &rewriter,
+                       TypedValue<VectorType> v, VectorType expectedTy) {
+  if (v.getType() == expectedTy) // Types already agree: no cast is needed.
+    return v;
+  assert(v.getType().getElementType() == expectedTy.getElementType() &&
+         "element types must match");
+  assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
+         "total number of elements must match");
+  return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
+}
+
 struct CreateNdDescOpPattern
     : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -63,17 +77,24 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
     if (op.getTensorDescType().getLayout() != layout)
       return rewriter.notifyMatchFailure(
           op, "conflicting layout attributes on tensor descriptor and anchor");
-    auto distributedVectorTypeOrFailure =
+    auto supportedWiResultTyOrFailure =
         xegpu::getDistributedVectorType(op.getTensorDescType());
-    if (failed(distributedVectorTypeOrFailure))
+    auto expectedWiResultTyOrFailure =
+        xegpu::getDistVecTypeBasedOnLaneLayout(layout, op.getType());
+    if (failed(supportedWiResultTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute distributed vector type from the layout");
+          op, "unable to compute the workitem vector type for LoadNdOp");
+    if (failed(expectedWiResultTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          op,
+          "unable to compute expected workitem vector type from lane layout");
     auto newOp = xegpu::LoadNdOp::create(
-        rewriter, op.getLoc(), distributedVectorTypeOrFailure.value(),
+        rewriter, op.getLoc(), supportedWiResultTyOrFailure.value(),
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
         op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
         op.getL3HintAttr(), /**layout**/ nullptr);
-    rewriter.replaceOp(op, newOp.getResult());
+    rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
+                                     expectedWiResultTyOrFailure.value()));
     return success();
   }
 };
@@ -97,20 +118,157 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
     if (valueLayout != layout)
       return rewriter.notifyMatchFailure(
           op, "conflicting layout attributes on value and anchor");
-    auto distributedVectorTypeOrFailure =
+    auto supportedWiValueTyOrFailure =
         xegpu::getDistributedVectorType(op.getTensorDescType());
-    if (failed(distributedVectorTypeOrFailure))
+    if (failed(supportedWiValueTyOrFailure))
       return rewriter.notifyMatchFailure(
-          op, "unable to compute distributed vector type from the layout");
-    xegpu::StoreNdOp::create(rewriter, op.getLoc(), adaptor.getValue(),
-                             adaptor.getTensorDesc(), op.getMixedOffsets(),
-                             op.getL1HintAttr(), op.getL2HintAttr(),
-                             op.getL3HintAttr(), /**layout**/ nullptr);
+          op,
+          "unable to compute wi vector type for StoreNdOp value from tensor "
+          "descriptor");
+
+    xegpu::StoreNdOp::create(
+        rewriter, op.getLoc(),
+        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
+                  supportedWiValueTyOrFailure.value()),
+        adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
+        op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
     rewriter.eraseOp(op);
     return success();
   }
 };
 
+/// Distributes a subgroup-level xegpu.dpas to workitem level, shape-casting
+/// operands/result between lane-layout-based and dpas-supported vector types.
+struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
+  using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Layouts are optional: cast<> would assert on a missing attribute.
+    auto layoutA = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAAttr());
+    auto layoutB = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutBAttr());
+    auto layoutCd =
+        dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutCdAttr());
+    if (!layoutA || !layoutB || !layoutCd)
+      return failure();
+    auto wiResultTy = xegpu::getDistributedVectorType(op.getType(), layoutCd);
+    auto wiATy =
+        xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
+    auto wiBTy =
+        xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
+    auto expectedWiResultTy =
+        xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
+    if (failed(wiResultTy) || failed(wiATy) || failed(wiBTy))
+      return rewriter.notifyMatchFailure(
+          op, "failed to calculate supported workitem vector types for DpasOp "
+              "from layouts");
+    if (failed(expectedWiResultTy))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute expected workitem vector type for DpasOp from "
+              "lane layout");
+    Value acc = adaptor.getAcc(); // Optional operand: cast only when present.
+    if (acc)
+      acc = resolveTy(rewriter, cast<TypedValue<VectorType>>(acc),
+                      wiResultTy.value());
+    auto newOp = xegpu::DpasOp::create(
+        rewriter, op->getLoc(), wiResultTy.value(),
+        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
+                  wiATy.value()),
+        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
+                  wiBTy.value()),
+        acc, /*layoutA=*/nullptr, /*layoutB=*/nullptr, /*layoutCd=*/nullptr);
+    rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
+                                     expectedWiResultTy.value()));
+    return success();
+  }
+};
+
+/// Distributes any single-result elementwise-mappable op with a vector result
+/// by re-creating it with the workitem type derived from its lane layout.
+struct ElementWiseOpPattern : public ConversionPattern {
+  // `typeConverter` keeps the constructor uniform with the other patterns so
+  // all of them can be registered together; it is not forwarded to the base.
+  ElementWiseOpPattern(TypeConverter &typeConverter, MLIRContext *ctx)
+      : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Reject anything that is not an elementwise op with exactly one result.
+    if (op->getNumResults() != 1 || !OpTrait::hasElementwiseMappableTraits(op))
+      return failure();
+
+    OpResult result = op->getResult(0);
+    auto sgVecTy = dyn_cast<VectorType>(result.getType());
+    if (!sgVecTy)
+      return rewriter.notifyMatchFailure(
+          op, "operation result is not a vector type");
+
+    xegpu::DistributeLayoutAttr layout = xegpu::getTemporaryLayout(result);
+    if (!layout || !layout.isForSubgroup())
+      return rewriter.notifyMatchFailure(
+          op, "operation result does not have subgroup distribute layout");
+
+    FailureOr<VectorType> wiVecTy =
+        xegpu::getDistVecTypeBasedOnLaneLayout(layout, sgVecTy);
+    if (failed(wiVecTy))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute workitem vector type from the layout");
+
+    // Rebuild the op generically: same name and the operands handed to this
+    // pattern, the new result type, and all non-layout attributes.
+    OperationState state(op->getLoc(), op->getName());
+    state.addOperands(operands);
+    state.addTypes(*wiVecTy);
+    for (NamedAttribute attr : op->getAttrs())
+      if (!isa<xegpu::DistributeLayoutAttr>(attr.getValue()))
+        state.addAttribute(attr.getName(), attr.getValue());
+    rewriter.replaceOp(op, rewriter.create(state)->getResult(0));
+    return success();
+  }
+};
+
+/// Distributes a splat vector arith.constant that carries a subgroup layout
+/// into a splat constant of the corresponding workitem vector type.
+struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
+  using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto sgVecTy = dyn_cast<VectorType>(op.getType());
+    if (!sgVecTy)
+      return failure();
+
+    // Only splat constants are handled: the same scalar is reused to build
+    // the smaller workitem constant.
+    auto splatAttr = dyn_cast<SplatElementsAttr>(op.getValue());
+    if (!splatAttr)
+      return rewriter.notifyMatchFailure(
+          op, "only dense splat vector constants are supported");
+
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getTemporaryLayout(llvm::cast<OpResult>(op.getResult()));
+    if (!layout || !layout.isForSubgroup())
+      return rewriter.notifyMatchFailure(
+          op, "operation result does not have subgroup distribute layout");
+
+    FailureOr<VectorType> wiVecTy =
+        xegpu::getDistVecTypeBasedOnLaneLayout(layout, sgVecTy);
+    if (failed(wiVecTy))
+      return rewriter.notifyMatchFailure(
+          op, "unable to compute workitem vector type from the layout");
+
+    Attribute scalarValue = splatAttr.getSplatValue<Attribute>();
+    auto wiAttr = DenseElementsAttr::get(*wiVecTy, scalarValue);
+    auto newOp =
+        arith::ConstantOp::create(rewriter, op.getLoc(), *wiVecTy, wiAttr);
+    rewriter.replaceOp(op, newOp.getResult());
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -141,6 +299,7 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
 void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
     RewritePatternSet &patterns, TypeConverter &typeConverter) {
-  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern>(
+  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
+               DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
       typeConverter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9113f00ac39f0..8beadeb5da309 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -65,48 +65,6 @@ namespace {
 static constexpr unsigned regularPatternBenefit = 1;
 static constexpr unsigned highPatternBenefit = 2;
 
-/// Helper function to get  distributed vector type for a source vector type
-/// according to the lane_layout. We simply divide each dimension of tensor
-/// descriptor shape by corresponding lane_layout dimension. If
-/// array_length > 1, that is appended to the front of the ditributed shape.
-/// NOTE: This is the vector type that will be returned by the
-/// gpu.warp_execute_on_lane0 op.
-///
-/// Examples:
-/// | original vector shape | lane_layout | distributed vector shape |
-/// |-----------------------|-------------|--------------------------|
-/// | 32x16                 | [1, 16]     | 32x1                     |
-/// | 32x16                 | [2, 8]      | 16x2                     |
-/// | 2x32x16               | [1, 16]     | 2x32x1                   |
-static FailureOr<VectorType>
-getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
-                                VectorType originalType) {
-  if (!layout)
-    return failure();
-  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
-         "Expecting a valid layout.");
-  SmallVector<int64_t> effectiveLaneLayout =
-      layout.getEffectiveLaneLayoutAsInt();
-  assert(static_cast<size_t>(originalType.getRank()) >=
-             effectiveLaneLayout.size() &&
-         "Rank of the original vector type should be greater or equal to the "
-         "size of the lane layout to distribute the vector type.");
-  SmallVector<int64_t> distributedShape(originalType.getShape());
-  // Only distribute the last `laneLayout.size()` dimensions. The remaining
-  // dimensions are not distributed.
-  unsigned distributionStart =
-      originalType.getRank() - effectiveLaneLayout.size();
-  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart)
-      continue;
-    // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-      return failure();
-    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
-  }
-  return VectorType::get(distributedShape, originalType.getElementType());
-}
-
 /// Helper function to resolve types if the distributed type out of
 /// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
 /// Example 1:
@@ -409,7 +367,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
           storeOp, "the source tensor descriptor lacks layout attribute");
 
     FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
-        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
     if (failed(distributedTypeByWarpOpOrFailure))
       return rewriter.notifyMatchFailure(storeOp,
                                          "Failed to distribute the type");
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 51783b41c4c96..addec519c405e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -101,6 +101,35 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
   return xegpu::getDistributedVectorType(helperTdescTy);
 }
 
+/// Divides the trailing dimensions of `originalType` by the effective lane
+/// layout of `layout`; fails on a null layout or an uneven division.
+FailureOr<VectorType>
+xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
+                                       VectorType originalType) {
+  // Without a layout there is nothing to distribute by.
+  if (!layout)
+    return failure();
+  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
+         "Expecting a valid layout.");
+  SmallVector<int64_t> laneLayout = layout.getEffectiveLaneLayoutAsInt();
+  ArrayRef<int64_t> shape = originalType.getShape();
+  assert(static_cast<size_t>(originalType.getRank()) >= laneLayout.size() &&
+         "Rank of the original vector type should be greater or equal to the "
+         "size of the lane layout to distribute the vector type.");
+  // The lane layout applies to the trailing dimensions only; any leading
+  // dimensions keep their original extent.
+  SmallVector<int64_t> distributedShape(shape);
+  unsigned distributionStart = originalType.getRank() - laneLayout.size();
+  for (size_t i = distributionStart, e = shape.size(); i < e; ++i) {
+    int64_t lanes = laneLayout[i - distributionStart];
+    // A dimension that the lanes do not divide evenly cannot be distributed.
+    if (shape[i] % lanes != 0)
+      return failure();
+    distributedShape[i] = shape[i] / lanes;
+  }
+  return VectorType::get(distributedShape, originalType.getElementType());
+}
+
 std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
   const StringRef prefix("layout_operand_");
   unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index a8a654f3b4759..c89586f450164 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -36,4 +36,34 @@ gpu.func @store_nd() {
   gpu.return
 }
 
+gpu.func @dpas_op() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  %5 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    dense<0.0> : vector<8x16xf32>
+  %2 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3, %5
+    {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+     layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+     layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>  -> vector<8x16xf32>
+  gpu.return
+}
+
+gpu.func @elementwise_op() {
+  %c0 = arith.constant 0 : index
+  %0 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    dense<1.0> : vector<16x16xf32>
+  %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %2 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32>
+  %3 = arith.addf %0, %2
+    {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<16x16xf32>
+  gpu.return
+}
 }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 6a6b5f6c6a856..b3cd4ebdc71d0 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -6,8 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -18,6 +20,7 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/raw_ostream.h"
 #include <optional>
 
 using namespace mlir;
@@ -250,35 +253,6 @@ struct TestXeGPUSGDistribute
   }
 };
 
-static FailureOr<VectorType>
-getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
-                                VectorType originalType) {
-  if (!layout)
-    return failure();
-  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
-         "Expecting a valid layout.");
-  SmallVector<int64_t> effectiveLaneLayout =
-      layout.getEffectiveLaneLayoutAsInt();
-  assert(static_cast<size_t>(originalType.getRank()) >=
-             effectiveLaneLayout.size() &&
-         "Rank of the original vector type should be greater or equal to the "
-         "size of the lane layout to distribute the vector type.");
-  SmallVector<int64_t> distributedShape(originalType.getShape());
-  // Only distribute the last `laneLayout.size()` dimensions. The remaining
-  // dimensions are not distributed.
-  unsigned distributionStart =
-      originalType.getRank() - effectiveLaneLayout.size();
-  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
-    if (i < distributionStart)
-      continue;
-    // Check if the dimension can be distributed evenly.
-    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
-      return failure();
-    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
-  }
-  return VectorType::get(distributedShape, originalType.getElementType());
-}
-
 struct TestXeGPUSgToWiDistributeExperimental
     : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -311,7 +285,12 @@ struct TestXeGPUSgToWiDistributeExperimental
     MLIRContext *ctx = &getContext();
 
     TypeConverter typeConverter;
-    typeConverter.addConversion([](Type type) { return type; });
+    typeConverter.addConversion([](Type type) -> std::optional<Type> {
+      // non tensor_desc and vector types are legal as is.
+      if (!isa<TensorDescType, VectorType>(type))
+        return type;
+      return std::nullopt;
+    });
     typeConverter.addConversion([](TensorDescType type) -> Type {
       if (type.getLayoutAttr()) {
         return type.dropLayouts();
@@ -324,16 +303,16 @@ struct TestXeGPUSgToWiDistributeExperimental
       // If no valid layout, nothing to do.
       if (!layout || !layout.isForSubgroup())
         return std::nullopt;
-      Operation *op = v.getDefiningOp();
-      if (isa<LoadNdOp>(op)) {
-        auto loadNdOp = cast<LoadNdOp>(op);
-        layout = loadNdOp.getAnchorLayout();
-        auto newTyOrFailure =
-            getDistributedVectorType(loadNdOp.getTensorDescType());
-        if (succeeded(newTyOrFailure))
-          return *newTyOrFailure;
-        return std::nullopt;
-      }
+      // Operation *op = v.getDefiningOp();
+      // if (isa<LoadNdOp>(op)) {
+      //   auto loadNdOp = cast<LoadNdOp>(op);
+      //   layout = loadNdOp.getAnchorLayout();
+      //   auto newTyOrFailure =
+      //       getDistributedVectorType(loadNdOp.getTensorDescType());
+      //   if (succeeded(newTyOrFailure))
+      //     return *newTyOrFailure;
+      //   return std::nullopt;
+      // }
       // For other vector types, distribute based on the lane layout.
       if (isa<VectorType>(type)) {
         auto newTyOrFailure =
@@ -349,34 +328,8 @@ struct TestXeGPUSgToWiDistributeExperimental
       return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
           .getResult(0);
     };
-    // // Define a vector materialization cast. If the input and output have
-    // same
-    // // number of elements, perform a shape cast. Otherwise, use
-    // // UnrealizedConversionCastOp to handle the conversion.
-    // auto vectorMaterializationCast = [](OpBuilder &builder, Type type,
-    //                                     ValueRange inputs,
-    //                                     Location loc) -> Value {
-    //   if (inputs.size() != 1)
-    //     return {};
-    //   auto input = inputs.front();
-    //   auto inputVecTy = dyn_cast<VectorType>(input.getType());
-    //   auto targetVecTy = dyn_cast<VectorType>(type);
-    //   if (inputVecTy && targetVecTy) {
-    //     if (inputVecTy.getNumElements() == targetVecTy.getNumElements()) {
-    //       return vector::ShapeCastOp::create(builder, loc, targetVecTy,
-    //       input)
-    //           .getResult();
-    //     }
-    //     return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
-    //         .getResult(0);
-    //   }
-
-    //   return {};
-    // };
     typeConverter.addSourceMaterialization(materializeCast);
     typeConverter.addTargetMaterialization(materializeCast);
-    // typeConverter.addSourceMaterialization(vectorMaterializationCast);
-    // typeConverter.addTargetMaterialization(vectorMaterializationCast);
 
     ConversionTarget target(*ctx);
     // CreateNdDescOp is legal only if its result type has no layout attribute.
@@ -391,11 +344,46 @@ struct TestXeGPUSgToWiDistributeExperimental
         return true;
       return !anchorOp.getAnchorLayout();
     });
+    target.addDynamicallyLegalOp<arith::ConstantOp>(
+        [=](arith::ConstantOp op) -> bool {
+          // If the result type is not a vector, it's legal.
+          if (!isa<VectorType>(op.getResult().getType()))
+            return true;
+          // For vector result types, check if it has a layout attribute.
+          return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
+        });
+    // In math and arith dialects, only handle elementwise ops with a single
+    // result and with a result layout attribute.
+    target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+        [=](Operation *op) -> std::optional<bool> {
+          // Only handle elementwise mappable ops
+          if (!OpTrait::hasElementwiseMappableTraits(op))
+            return true;
+          // Only handle ops with single vector result
+          if (op->getNumResults() != 1)
+            return true;
+
+          VectorType resultType =
+              dyn_cast<VectorType>(op->getResult(0).getType());
+          if (!resultType)
+            return true;
+
+          // Check if all operands are vectors of the same shape
+          for (Value operand : op->getOperands()) {
+            VectorType operandType = dyn_cast<VectorType>(operand.getType());
+            if (!operandType ||
+                operandType.getShape() != resultType.getShape()) {
+              return true;
+            }
+          }
+          return !xegpu::getTemporaryLayout(
+              dyn_cast<OpResult>(op->getResult(0)));
+        });
+    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
 
     RewritePatternSet patterns(ctx);
     xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
                                                              typeConverter);
-    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
 
     (void)applyPartialConversion(getOperation(), target, std::move(patterns));
   }

>From 5708cc4c0c00e411beb4478c606fea59b4fb4a7d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 20 Jan 2026 19:37:42 +0000
Subject: [PATCH 07/18] save work

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |   9 +-
 .../XeGPUSgToWiDistributeExperimental.cpp     | 144 +++++++++++++++---
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 101 +-----------
 3 files changed, 131 insertions(+), 123 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 898af0ec14738..1930ef8e454d4 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -71,10 +71,11 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU workgroup to subgroup distribution into
 /// `patterns`.
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
-/// Appends patterns for XeGPU subgroup to work-item distribution into
-/// `patterns`.
-void populateXeGPUSgToWiDistributeExperimentalPatterns(
-    RewritePatternSet &patterns, TypeConverter &typeConverter);
+/// Defines type conversions and legality for XeGPU subgroup to workitem
+/// distribution and appends the required conversion patterns into `patterns`.
+void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+    TypeConverter &typeConverter, RewritePatternSet &patterns,
+    ConversionTarget &target);
 
 /// Collect a set of patterns to unroll xegpu operations to a smaller shapes.
 /// Users can control whether an operation to be unrolled or not, as well as
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index ed39a1ee39918..0fc022241095b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -14,6 +15,7 @@
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/GraphWriter.h"
@@ -42,6 +44,33 @@ static Value resolveTy(ConversionPatternRewriter &rewriter,
   return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
 }
 
+static LogicalResult verifyLayouts(Operation *root) {
+  auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
+    if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
+      auto layout = anchorOp.getAnchorLayout();
+      if (!layout) {
+        nestedOp->emitError("expected anchor layout attribute on operation");
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    }
+    // For each vector result, check if the op contains a result layout
+    // attribute.
+    for (OpResult result : nestedOp->getResults()) {
+      if (isa<VectorType>(result.getType())) {
+        auto layout = xegpu::getDistributeLayoutAttr(result);
+        if (!layout) {
+          nestedOp->emitError(
+              "expected result layout attribute on vector result");
+          return WalkResult::interrupt();
+        }
+      }
+    }
+    return WalkResult::advance();
+  });
+  return walkResult.wasInterrupted() ? failure() : success();
+}
+
 struct CreateNdDescOpPattern
     : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
@@ -278,27 +307,102 @@ struct XeGPUSgToWiDistributeExperimentalPass
 } // namespace
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
-  // // Recover layouts.
-  // Operation *op = getOperation();
-  // if (!xegpu::recoverTemporaryLayouts(op)) {
-  //   signalPassFailure();
-  //   return;
-  // }
-
-  // // Define conversion target
-  // ConversionTarget target(getContext());
-  // target.addLegalDialect<index::IndexDialect, memref::MemRefDialect,
-  //                        vector::VectorDialect>();
-  // target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>(
-  //     [](Operation *op) { return true; });
-
-  // // Define type converter
-  // TypeConverter typeConverter;
-  // typeConverter.addConversion([](Type type) { return type; });
+  // Verify if all XeGPU and vector operations have layouts.
+  Operation *root = getOperation();
+  if (failed(verifyLayouts(root))) {
+    signalPassFailure();
+    return;
+  }
 }
-
-void xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(
-    RewritePatternSet &patterns, TypeConverter &typeConverter) {
+void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+    TypeConverter &typeConverter, RewritePatternSet &patterns,
+    ConversionTarget &target) {
+
+  // Populate type conversions.
+  // - Any type other than TensorDescType and VectorType are legal as is.
+  typeConverter.addConversion([](Type type) -> std::optional<Type> {
+    if (!isa<TensorDescType, VectorType>(type))
+      return type;
+    return std::nullopt;
+  });
+  // For TensorDescType, drop the layout attribute if any.
+  typeConverter.addConversion([](TensorDescType type) -> Type {
+    if (type.getLayoutAttr()) {
+      return type.dropLayouts();
+    }
+    return type;
+  });
+  // - For VectorType, check if there is a distribute layout attribute on the
+  //   value. If so, convert to the distributed vector type based on the layout.
+  typeConverter.addConversion([](Value v) -> std::optional<Type> {
+    auto type = v.getType();
+    auto layout = xegpu::getDistributeLayoutAttr(v);
+    // If no valid layout, nothing to do.
+    if (!layout || !layout.isForSubgroup())
+      return std::nullopt;
+    // Vector type is distributed based on lane layout.
+    if (isa<VectorType>(type)) {
+      auto newTyOrFailure =
+          getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+      if (succeeded(newTyOrFailure))
+        return *newTyOrFailure;
+    }
+    return std::nullopt;
+  });
+  // - Materialization casts are only used for testing purposes.
+  auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                             mlir::ValueRange inputs,
+                             mlir::Location loc) -> mlir::Value {
+    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+        .getResult(0);
+  };
+  typeConverter.addSourceMaterialization(materializeCast);
+  typeConverter.addTargetMaterialization(materializeCast);
+  // Define legality.
+  // - CreateNdDescOp is legal only if its result type has no layout attribute.
+  target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+      [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
+  // - Any anchor XeGPU op is legal only if it has no anchor layout.
+  target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
+    auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
+    if (!anchorOp)
+      return true;
+    return !anchorOp.getAnchorLayout();
+  });
+  target.addDynamicallyLegalOp<arith::ConstantOp>(
+      [=](arith::ConstantOp op) -> bool {
+        // If the result type is not a vector, it's legal.
+        if (!isa<VectorType>(op.getResult().getType()))
+          return true;
+        // For vector result types, check if it has a layout attribute.
+        return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
+      });
+  // - In math and arith dialects, only handle elementwise ops with a single
+  //   result and with a result layout attribute.
+  target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+      [=](Operation *op) -> std::optional<bool> {
+        // Only handle elementwise mappable ops
+        if (!OpTrait::hasElementwiseMappableTraits(op))
+          return true;
+        // Only handle ops with single vector result
+        if (op->getNumResults() != 1)
+          return true;
+
+        VectorType resultType =
+            dyn_cast<VectorType>(op->getResult(0).getType());
+        if (!resultType)
+          return true;
+
+        // Check if all operands are vectors of the same shape
+        for (Value operand : op->getOperands()) {
+          VectorType operandType = dyn_cast<VectorType>(operand.getType());
+          if (!operandType || operandType.getShape() != resultType.getShape()) {
+            return true;
+          }
+        }
+        return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
+      });
+  target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
                DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
       typeConverter, patterns.getContext());
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index b3cd4ebdc71d0..c28ebf1f8bede 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -283,108 +283,11 @@ struct TestXeGPUSgToWiDistributeExperimental
 
   void runOnOperation() override {
     MLIRContext *ctx = &getContext();
-
     TypeConverter typeConverter;
-    typeConverter.addConversion([](Type type) -> std::optional<Type> {
-      // non tensor_desc and vector types are legal as is.
-      if (!isa<TensorDescType, VectorType>(type))
-        return type;
-      return std::nullopt;
-    });
-    typeConverter.addConversion([](TensorDescType type) -> Type {
-      if (type.getLayoutAttr()) {
-        return type.dropLayouts();
-      }
-      return type;
-    });
-    typeConverter.addConversion([](Value v) -> std::optional<Type> {
-      auto type = v.getType();
-      auto layout = xegpu::getDistributeLayoutAttr(v);
-      // If no valid layout, nothing to do.
-      if (!layout || !layout.isForSubgroup())
-        return std::nullopt;
-      // Operation *op = v.getDefiningOp();
-      // if (isa<LoadNdOp>(op)) {
-      //   auto loadNdOp = cast<LoadNdOp>(op);
-      //   layout = loadNdOp.getAnchorLayout();
-      //   auto newTyOrFailure =
-      //       getDistributedVectorType(loadNdOp.getTensorDescType());
-      //   if (succeeded(newTyOrFailure))
-      //     return *newTyOrFailure;
-      //   return std::nullopt;
-      // }
-      // For other vector types, distribute based on the lane layout.
-      if (isa<VectorType>(type)) {
-        auto newTyOrFailure =
-            getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
-        if (succeeded(newTyOrFailure))
-          return *newTyOrFailure;
-      }
-      return std::nullopt;
-    });
-    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
-                               mlir::ValueRange inputs,
-                               mlir::Location loc) -> mlir::Value {
-      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
-          .getResult(0);
-    };
-    typeConverter.addSourceMaterialization(materializeCast);
-    typeConverter.addTargetMaterialization(materializeCast);
-
     ConversionTarget target(*ctx);
-    // CreateNdDescOp is legal only if its result type has no layout attribute.
-    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
-        [&](xegpu::CreateNdDescOp op) {
-          return !op.getType().getLayoutAttr();
-        });
-    // Any anchor XeGPU op is legal only if it has no anchor layout.
-    target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
-      auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
-      if (!anchorOp)
-        return true;
-      return !anchorOp.getAnchorLayout();
-    });
-    target.addDynamicallyLegalOp<arith::ConstantOp>(
-        [=](arith::ConstantOp op) -> bool {
-          // If the result type is not a vector, it's legal.
-          if (!isa<VectorType>(op.getResult().getType()))
-            return true;
-          // For vector result types, check if it has a layout attribute.
-          return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
-        });
-    // In math and arith dialects, only handle elementwise ops with a single
-    // result and with a result layout attribute.
-    target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
-        [=](Operation *op) -> std::optional<bool> {
-          // Only handle elementwise mappable ops
-          if (!OpTrait::hasElementwiseMappableTraits(op))
-            return true;
-          // Only handle ops with single vector result
-          if (op->getNumResults() != 1)
-            return true;
-
-          VectorType resultType =
-              dyn_cast<VectorType>(op->getResult(0).getType());
-          if (!resultType)
-            return true;
-
-          // Check if all operands are vectors of the same shape
-          for (Value operand : op->getOperands()) {
-            VectorType operandType = dyn_cast<VectorType>(operand.getType());
-            if (!operandType ||
-                operandType.getShape() != resultType.getShape()) {
-              return true;
-            }
-          }
-          return !xegpu::getTemporaryLayout(
-              dyn_cast<OpResult>(op->getResult(0)));
-        });
-    target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-
     RewritePatternSet patterns(ctx);
-    xegpu::populateXeGPUSgToWiDistributeExperimentalPatterns(patterns,
-                                                             typeConverter);
-
+    xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+        typeConverter, patterns, target);
     (void)applyPartialConversion(getOperation(), target, std::move(patterns));
   }
 };

>From 4579c154bbdd8cfc55c5f5e6858b39b53adc8fee Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 06:24:52 +0000
Subject: [PATCH 08/18] save working version

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |   5 +
 .../XeGPUSgToWiDistributeExperimental.cpp     | 263 +++++++++++++++---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |   3 +-
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |   3 +
 4 files changed, 240 insertions(+), 34 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1930ef8e454d4..35a5154a2ce59 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -71,8 +71,13 @@ void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU workgroup to subgroup distribution into
 /// `patterns`.
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
+/// Define only the type conversions needed for XeGPU subgroup to workitem
+/// distribution.
+void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
 /// Defines type conversions and legality for XeGPU subgroup to workitem
 /// distribution and appends the required conversion patterns into `patterns`.
+/// Appends patterns for XeGPU subgroup to workitem distribution into
+/// `patterns`.
 void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 0fc022241095b..b79cf652083de 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -8,19 +8,27 @@
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
+#include <optional>
 
 namespace mlir {
 namespace xegpu {
@@ -31,17 +39,30 @@ namespace xegpu {
 
 using namespace mlir;
 
+#define DEBUG_TYPE "xegpu-sg-to-wi-distribute-experimental"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
 namespace {
 
 static Value resolveTy(ConversionPatternRewriter &rewriter,
                        TypedValue<VectorType> v, VectorType expectedTy) {
+  // llvm::errs() << "value:" << v << " expectedTy: " << expectedTy << "\n";
   if (v.getType() == expectedTy)
     return v;
-  assert(v.getType().getElementType() == expectedTy.getElementType() &&
-         "element types must match");
-  assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
-         "total number of elements must match");
-  return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
+  // assert(v.getType().getElementType() == expectedTy.getElementType() &&
+  //        "element types must match");
+  // assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
+  //        "total number of elements must match");
+  // If both types are vector types and their element counts match, insert a
+  // shape cast.
+  if (isa<VectorType>(v.getType()) &&
+      v.getType().getNumElements() == expectedTy.getNumElements())
+    return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
+
+  // else create an unrealized cast.
+  auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
+                                                  expectedTy, ValueRange{v});
+  return newOp.getResult(0);
 }
 
 static LogicalResult verifyLayouts(Operation *root) {
@@ -172,13 +193,14 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
   LogicalResult
   matchAndRewrite(xegpu::DpasOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    // llvm::errs() << "DpasOpPattern matchAndRewrite called\n";
     // Check if the op has A, B and CD layouts attached.
     auto layoutA = cast<xegpu::LayoutAttr>(op.getLayoutAAttr());
     auto layoutB = cast<xegpu::LayoutAttr>(op.getLayoutBAttr());
     auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
     if (!layoutA || !layoutB || !layoutCd)
       return failure();
-
+    // llvm::errs() << "trying to calculate wi types for dpas op\n";
     auto wiResultTyOrFailure =
         xegpu::getDistributedVectorType(op.getType(), layoutCd);
     auto wiATypeOrFailure =
@@ -196,6 +218,8 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
       return rewriter.notifyMatchFailure(
           op, "unable to compute expected workitem vector type for DpasOp from "
               "lane layout");
+    // llvm::errs() << "adaptor acc type: " << adaptor.getAcc().getType() <<
+    // "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
     auto newOp = xegpu::DpasOp::create(
         rewriter, op->getLoc(), wiResultTyOrFailure.value(),
         resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
@@ -307,17 +331,194 @@ struct XeGPUSgToWiDistributeExperimentalPass
 } // namespace
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
-  // Verify if all XeGPU and vector operations have layouts.
+
+  // llvm::errs() << "Running XeGPUSgToWiDistributeExperimentalPass\n";
+  // Verify if all XeGPU anchor ops and vector ops have result layouts.
   Operation *root = getOperation();
-  if (failed(verifyLayouts(root))) {
-    signalPassFailure();
-    return;
+  // if (failed(verifyLayouts(root))) {
+  //   LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
+  //                        "verification failed\n");
+  //   signalPassFailure();
+  //   return;
+  // }
+  // Collect existing UnrealizedConversionCastOps.
+  llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
+  // root->walk(
+  //     [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp);
+  //     });
+  // Perform a structural type conversion. This will insert
+  // UnrealizedConversionCastOps for type materializations.
+  // auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+  //                            mlir::ValueRange inputs,
+  //                            mlir::Location loc) -> mlir::Value {
+  //   // If single input and both input and output types are vector types,
+  //   // and they have same number of elements, insert a shape cast.
+  //   // if (inputs.size() == 1) {
+  //   //   auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
+  //   //   auto outputTy = dyn_cast<VectorType>(type);
+  //   //   if (inputTy && outputTy &&
+  //   //       inputTy.getNumElements() == outputTy.getNumElements()) {
+  //   //     return vector::ShapeCastOp::create(builder, loc, outputTy,
+  //   //     inputs[0])
+  //   //         .getResult();
+  //   //   }
+  //   // }
+  //   UnrealizedConversionCastOp castOp =
+  //       UnrealizedConversionCastOp::create(builder, loc, type, inputs);
+
+  //   // // If inputs is a single vector type and type is also a vector, then
+  //   // layout
+  //   // // must be propagated.
+  //   // if (inputs.size() == 1 && isa<VectorType>(inputs[0].getType()) &&
+  //   //     isa<VectorType>(type)) {
+  //   //   auto layout = xegpu::getDistributeLayoutAttr(inputs[0]);
+  //   //   if (layout)
+  //   //     xegpu::setDistributeLayoutAttr(castOp->getOpResult(0), layout);
+  //   // }
+
+  //   return castOp.getResult(0);
+  // };
+  // {
+  //   ConversionTarget target(getContext());
+  //   TypeConverter typeConverter;
+  //   RewritePatternSet patterns(&getContext());
+  //   typeConverter.addSourceMaterialization(materializeCast);
+  //   typeConverter.addTargetMaterialization(materializeCast);
+  //   xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+  //   scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+  //                                                        patterns, target);
+  //   target.addLegalOp<UnrealizedConversionCastOp>();
+  //   (void)applyPartialConversion(root, target, std::move(patterns));
+  // }
+  // // Apply the XeGPU subgroup to workitem distribution patterns.
+  // {
+  //   ConversionTarget target(getContext());
+  //   TypeConverter typeConverter;
+  //   typeConverter.addTargetMaterialization(materializeCast);
+  //   typeConverter.addSourceMaterialization(materializeCast);
+  //   RewritePatternSet patterns(&getContext());
+  //   xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+  //       typeConverter, patterns, target);
+  //   target.addLegalOp<UnrealizedConversionCastOp>();
+  //   (void)applyPartialConversion(root, target, std::move(patterns));
+  // }
+  // UnrealizedConversionCastOp is legal if it existed before.
+  // target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+  //     [&](UnrealizedConversionCastOp op) {
+  //       return existingCasts.contains(op);
+  //     });
+  // Define a pattern for handling UnrealizedConversionCastOps that were
+  // newly created during the structural type conversion.
+  class ResolveUnrealizedCastPattern
+      : public OpConversionPattern<UnrealizedConversionCastOp> {
+  public:
+    // Pass existingCasts in the constructor to identify existing casts.
+    ResolveUnrealizedCastPattern(
+        TypeConverter &typeConverter,
+        llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts,
+        MLIRContext &ctx)
+        : OpConversionPattern<UnrealizedConversionCastOp>(typeConverter, &ctx),
+          existingCasts(existingCasts) {}
+    // using
+    // OpConversionPattern<UnrealizedConversionCastOp>::OpConversionPattern;
+    LogicalResult
+    matchAndRewrite(UnrealizedConversionCastOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter) const override {
+      // If this op existed before, nothing to do.
+      if (existingCasts.contains(op))
+        return failure();
+      // number of inputs and outputs must be 1.
+      if (op.getNumOperands() != 1 || op.getNumResults() != 1)
+        return failure();
+      // Both input and output types must be vector types.
+      auto singleInput = op.getInputs()[0];
+      auto inputTy = dyn_cast<VectorType>(singleInput.getType());
+      auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
+      llvm::errs() << "input ty : " << inputTy << " output ty: " << outputTy
+                   << "\n";
+      if (!inputTy || !outputTy)
+        return failure();
+
+      // Check if the defining op of the input is also an
+      // UnrealizedConversionCastOp.
+      auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
+      if (!definingOp)
+        return rewriter.notifyMatchFailure(
+            op, "input defining op is not an UnrealizedConversionCastOp");
+      auto inputOfDefiningOp = definingOp.getInputs()[0];
+      // If the defining op's input and the output are both vector types with
+      // the same number of elements, insert a shape cast.
+      auto inputOfDefiningOpTy =
+          dyn_cast<VectorType>(inputOfDefiningOp.getType());
+      if (inputOfDefiningOpTy && outputTy &&
+          inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
+        auto shapeCast = vector::ShapeCastOp::create(
+            rewriter, op.getLoc(), outputTy, inputOfDefiningOp);
+        rewriter.replaceOp(op, shapeCast.getResult());
+        return success();
+      }
+
+      return rewriter.notifyMatchFailure(
+          op, "unable to resolve unrealized conversion cast");
+    }
+
+  private:
+    llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts;
+  };
+  // Finally, remove unnecessary UnrealizedConversionCastOps.
+  OpBuilder builder(root);
+  root->walk([&](UnrealizedConversionCastOp op) {
+    // If this op existed before, nothing to do.
+    if (existingCasts.contains(op))
+      return;
+    // number of inputs and outputs must be 1.
+    if (op.getNumOperands() != 1 || op.getNumResults() != 1)
+      return;
+    // Both input and output types must be vector types.
+    auto singleInput = op.getInputs()[0];
+    auto inputTy = dyn_cast<VectorType>(singleInput.getType());
+    auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
+    if (!inputTy || !outputTy)
+      return;
+
+    // Check if the defining op of the input is also an
+    // UnrealizedConversionCastOp and it has a single user (which is this op).
+    auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
+    if (!definingOp || !definingOp->hasOneUse())
+      return;
+    auto inputOfDefiningOp = definingOp.getInputs()[0];
+    // If the defining op's input and the output are both vector types with
+    // the same number of elements, insert a shape cast.
+    auto inputOfDefiningOpTy =
+        dyn_cast<VectorType>(inputOfDefiningOp.getType());
+    if (inputOfDefiningOpTy &&
+        inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
+      builder.setInsertionPoint(op);
+      auto shapeCast = vector::ShapeCastOp::create(builder, op.getLoc(),
+                                                   outputTy, inputOfDefiningOp);
+      op.replaceAllUsesWith(ValueRange{shapeCast.getResult()});
+      return;
+    }
+  });
+  // At this point, we will have some dead UnrealizedConversionCastOps. Just
+  // erase them.
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    root->walk([&](UnrealizedConversionCastOp op) {
+      // Skip existing casts.
+      if (existingCasts.contains(op))
+        return;
+      if (op.use_empty()) {
+        op.erase();
+        changed = true;
+      }
+    });
   }
 }
-void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
-    TypeConverter &typeConverter, RewritePatternSet &patterns,
-    ConversionTarget &target) {
 
+void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
+    TypeConverter &typeConverter) {
   // Populate type conversions.
   // - Any type other than TensorDescType and VectorType are legal as is.
   typeConverter.addConversion([](Type type) -> std::optional<Type> {
@@ -325,7 +526,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       return type;
     return std::nullopt;
   });
-  // For TensorDescType, drop the layout attribute if any.
+  // - For TensorDescType, drop the layout attribute if any.
   typeConverter.addConversion([](TensorDescType type) -> Type {
     if (type.getLayoutAttr()) {
       return type.dropLayouts();
@@ -336,29 +537,25 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
   //   value. If so, convert to the distributed vector type based on the layout.
   typeConverter.addConversion([](Value v) -> std::optional<Type> {
     auto type = v.getType();
+    // If value is not vector type, nothing to do.
+    if (!isa<VectorType>(type))
+      return std::nullopt;
     auto layout = xegpu::getDistributeLayoutAttr(v);
-    // If no valid layout, nothing to do.
     if (!layout || !layout.isForSubgroup())
-      return std::nullopt;
+      return type;
     // Vector type is distributed based on lane layout.
-    if (isa<VectorType>(type)) {
-      auto newTyOrFailure =
-          getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
-      if (succeeded(newTyOrFailure))
-        return *newTyOrFailure;
-    }
-    return std::nullopt;
+    auto newTyOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, cast<VectorType>(type));
+    if (failed(newTyOrFailure))
+      return type;
+    return *newTyOrFailure;
   });
-  // - Materialization casts are only used for testing purposes.
-  auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
-                             mlir::ValueRange inputs,
-                             mlir::Location loc) -> mlir::Value {
-    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
-        .getResult(0);
-  };
-  typeConverter.addSourceMaterialization(materializeCast);
-  typeConverter.addTargetMaterialization(materializeCast);
-  // Define legality.
+}
+
+void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+    TypeConverter &typeConverter, RewritePatternSet &patterns,
+    ConversionTarget &target) {
+  populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
   // - CreateNdDescOp is legal only if its result type has no layout attribute.
   target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
       [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index addec519c405e..1a1f331efe608 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -21,6 +21,7 @@
 #include "mlir/IR/ValueRange.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>
@@ -168,7 +169,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
     auto *parentOp = arg.getOwner()->getParentOp();
-    if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
+    if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
       if (tiedInit)
         return getDistributeLayoutAttr(tiedInit->get());
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index c28ebf1f8bede..03a11908499e1 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -288,6 +289,8 @@ struct TestXeGPUSgToWiDistributeExperimental
     RewritePatternSet patterns(ctx);
     xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         typeConverter, patterns, target);
+    scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+                                                         patterns, target);
     (void)applyPartialConversion(getOperation(), target, std::move(patterns));
   }
 };

>From 63eed8461498870fc46014c17c817a0af178c299 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 17:42:23 +0000
Subject: [PATCH 09/18] save working version

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 211 +++++-------------
 1 file changed, 60 insertions(+), 151 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index b79cf652083de..8005637da4bf8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -44,17 +44,10 @@ using namespace mlir;
 
 namespace {
 
-static Value resolveTy(ConversionPatternRewriter &rewriter,
-                       TypedValue<VectorType> v, VectorType expectedTy) {
-  // llvm::errs() << "value:" << v << " expectedTy: " << expectedTy << "\n";
+static Value castValueTo(ConversionPatternRewriter &rewriter,
+                         TypedValue<VectorType> v, VectorType expectedTy) {
   if (v.getType() == expectedTy)
     return v;
-  // assert(v.getType().getElementType() == expectedTy.getElementType() &&
-  //        "element types must match");
-  // assert(v.getType().getNumElements() == expectedTy.getNumElements() &&
-  //        "total number of elements must match");
-  // If both types are vector type and number of elements match, insert a shape
-  // cast.
   if (isa<VectorType>(v.getType()) &&
       v.getType().getNumElements() == expectedTy.getNumElements())
     return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
@@ -143,8 +136,8 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
         op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
         op.getL3HintAttr(), /**layout**/ nullptr);
-    rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
-                                     expectedWiResultTyOrFailure.value()));
+    rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+                                       expectedWiResultTyOrFailure.value()));
     return success();
   }
 };
@@ -178,8 +171,8 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
 
     xegpu::StoreNdOp::create(
         rewriter, op.getLoc(),
-        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
-                  supportedWiValueTyOrFailure.value()),
+        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
+                    supportedWiValueTyOrFailure.value()),
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
         op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
     rewriter.eraseOp(op);
@@ -222,17 +215,17 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
     // "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
     auto newOp = xegpu::DpasOp::create(
         rewriter, op->getLoc(), wiResultTyOrFailure.value(),
-        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
-                  wiATypeOrFailure.value()),
-        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
-                  wiBTypeOrFailure.value()),
-        resolveTy(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
-                  wiResultTyOrFailure.value()),
+        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
+                    wiATypeOrFailure.value()),
+        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
+                    wiBTypeOrFailure.value()),
+        castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
+                    wiResultTyOrFailure.value()),
         /** layoutA**/ nullptr,
         /** layoutB**/ nullptr, /** layoutCd**/ nullptr);
     // Explicitly set the new types to enable correct type materializations.
-    rewriter.replaceOp(op, resolveTy(rewriter, newOp.getResult(),
-                                     expectedWiResultTyOrFailure.value()));
+    rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
+                                       expectedWiResultTyOrFailure.value()));
     return success();
   }
 };
@@ -332,140 +325,56 @@ struct XeGPUSgToWiDistributeExperimentalPass
 
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
-  // llvm::errs() << "Running XeGPUSgToWiDistributeExperimentalPass\n";
   // Verify if all XeGPU anchor ops and vector ops have result layouts.
   Operation *root = getOperation();
-  // if (failed(verifyLayouts(root))) {
-  //   LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
-  //                        "verification failed\n");
-  //   signalPassFailure();
-  //   return;
-  // }
-  // Collect existing UnrealizedConversionCastOps.
+  if (failed(verifyLayouts(root))) {
+    LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
+                         "verification failed\n");
+    signalPassFailure();
+    return;
+  }
+  // Collect existing UnrealizedConversionCastOps. These must be preserved.
   llvm::SmallSetVector<UnrealizedConversionCastOp, 8> existingCasts;
-  // root->walk(
-  //     [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp);
-  //     });
-  // Perform a structural type conversion. This will insert
-  // UnrealizedConversionCastOps for type materializations.
-  // auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
-  //                            mlir::ValueRange inputs,
-  //                            mlir::Location loc) -> mlir::Value {
-  //   // If single input and both input and output types are vector types,
-  //   // and they have same number of elements, insert a shape cast.
-  //   // if (inputs.size() == 1) {
-  //   //   auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
-  //   //   auto outputTy = dyn_cast<VectorType>(type);
-  //   //   if (inputTy && outputTy &&
-  //   //       inputTy.getNumElements() == outputTy.getNumElements()) {
-  //   //     return vector::ShapeCastOp::create(builder, loc, outputTy,
-  //   //     inputs[0])
-  //   //         .getResult();
-  //   //   }
-  //   // }
-  //   UnrealizedConversionCastOp castOp =
-  //       UnrealizedConversionCastOp::create(builder, loc, type, inputs);
-
-  //   // // If inputs is a single vector type and type is also a vector, then
-  //   // layout
-  //   // // must be propagated.
-  //   // if (inputs.size() == 1 && isa<VectorType>(inputs[0].getType()) &&
-  //   //     isa<VectorType>(type)) {
-  //   //   auto layout = xegpu::getDistributeLayoutAttr(inputs[0]);
-  //   //   if (layout)
-  //   //     xegpu::setDistributeLayoutAttr(castOp->getOpResult(0), layout);
-  //   // }
-
-  //   return castOp.getResult(0);
-  // };
-  // {
-  //   ConversionTarget target(getContext());
-  //   TypeConverter typeConverter;
-  //   RewritePatternSet patterns(&getContext());
-  //   typeConverter.addSourceMaterialization(materializeCast);
-  //   typeConverter.addTargetMaterialization(materializeCast);
-  //   xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
-  //   scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
-  //                                                        patterns, target);
-  //   target.addLegalOp<UnrealizedConversionCastOp>();
-  //   (void)applyPartialConversion(root, target, std::move(patterns));
-  // }
-  // // Apply the XeGPU subgroup to workitem distribution patterns.
-  // {
-  //   ConversionTarget target(getContext());
-  //   TypeConverter typeConverter;
-  //   typeConverter.addTargetMaterialization(materializeCast);
-  //   typeConverter.addSourceMaterialization(materializeCast);
-  //   RewritePatternSet patterns(&getContext());
-  //   xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
-  //       typeConverter, patterns, target);
-  //   target.addLegalOp<UnrealizedConversionCastOp>();
-  //   (void)applyPartialConversion(root, target, std::move(patterns));
-  // }
-  // UnrealizedConversionCastOp is legal if it existed before.
-  // target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
-  //     [&](UnrealizedConversionCastOp op) {
-  //       return existingCasts.contains(op);
-  //     });
-  // Define a pattern for handling UnrealizedConversionCastOps that were
-  // newly created during the structural type conversion.
-  class ResolveUnrealizedCastPattern
-      : public OpConversionPattern<UnrealizedConversionCastOp> {
-  public:
-    // Pass existsingCasts in the constructor to identify existing casts.
-    ResolveUnrealizedCastPattern(
-        TypeConverter &typeConverter,
-        llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts,
-        MLIRContext &ctx)
-        : OpConversionPattern<UnrealizedConversionCastOp>(typeConverter, &ctx),
-          existingCasts(existingCasts) {}
-    // using
-    // OpConversionPattern<UnrealizedConversionCastOp>::OpConversionPattern;
-    LogicalResult
-    matchAndRewrite(UnrealizedConversionCastOp op, OpAdaptor adaptor,
-                    ConversionPatternRewriter &rewriter) const override {
-      // If this op existed before, nothing to do.
-      if (existingCasts.contains(op))
-        return failure();
-      // number of inputs and outputs must be 1.
-      if (op.getNumOperands() != 1 || op.getNumResults() != 1)
-        return failure();
-      // Both input and output types must be vector types.
-      auto singleInput = op.getInputs()[0];
-      auto inputTy = dyn_cast<VectorType>(singleInput.getType());
-      auto outputTy = dyn_cast<VectorType>(op.getResult(0).getType());
-      llvm::errs() << "input ty : " << inputTy << " output ty: " << outputTy
-                   << "\n";
-      if (!inputTy || !outputTy)
-        return failure();
-
-      // Check if the defining op of the input is also an
-      // UnrealizedConversionCastOp.
-      auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
-      if (!definingOp)
-        return rewriter.notifyMatchFailure(
-            op, "input defining op is not an UnrealizedConversionCastOp");
-      auto inputOfDefiningOp = definingOp.getInputs()[0];
-      // If the input of the defining op and output type are both vector types
-      // have same number of elements, insert a shape cast.
-      auto inputOfDefiningOpTy =
-          dyn_cast<VectorType>(inputOfDefiningOp.getType());
-      if (inputOfDefiningOpTy && outputTy &&
-          inputOfDefiningOpTy.getNumElements() == outputTy.getNumElements()) {
-        auto shapeCast = vector::ShapeCastOp::create(
-            rewriter, op.getLoc(), outputTy, inputOfDefiningOp);
-        rewriter.replaceOp(op, shapeCast.getResult());
-        return success();
-      }
-
-      return rewriter.notifyMatchFailure(
-          op, "unable to resolve unrealized conversion cast");
-    }
-
-  private:
-    llvm::SmallSetVector<UnrealizedConversionCastOp, 8> &existingCasts;
+  root->walk(
+      [&](UnrealizedConversionCastOp castOp) { existingCasts.insert(castOp); });
+  // Perform a structural type conversion to convert structural ops to have WI
+  // types. This will insert UnrealizedConversionCastOps to make the IR
+  // valid.
+  auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                             mlir::ValueRange inputs,
+                             mlir::Location loc) -> mlir::Value {
+    UnrealizedConversionCastOp castOp =
+        UnrealizedConversionCastOp::create(builder, loc, type, inputs);
+    return castOp.getResult(0);
   };
-  // Finally, remove unnecessary UnrealizedConversionCastOps.
+  {
+    ConversionTarget target(getContext());
+    TypeConverter typeConverter;
+    RewritePatternSet patterns(&getContext());
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
+    xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+    scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+                                                         patterns, target);
+    target.addLegalOp<UnrealizedConversionCastOp>();
+    (void)applyPartialConversion(root, target, std::move(patterns));
+  }
+  // Apply the XeGPU subgroup to workitem distribution patterns.
+  {
+    ConversionTarget target(getContext());
+    TypeConverter typeConverter;
+    typeConverter.addTargetMaterialization(materializeCast);
+    typeConverter.addSourceMaterialization(materializeCast);
+    RewritePatternSet patterns(&getContext());
+    xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+        typeConverter, patterns, target);
+    target.addLegalOp<UnrealizedConversionCastOp>();
+    (void)applyPartialConversion(root, target, std::move(patterns));
+  }
+  // Structural type conversion can generate some redundant
+  // UnrealizedConversionCastOps to materialize the SG type from type converted
+  // WI type. These are redundant at this point and can be eliminated by
+  // inserting shape casts instead.
   OpBuilder builder(root);
   root->walk([&](UnrealizedConversionCastOp op) {
     // If this op existed before, nothing to do.

>From 2cd613fc7b65fe57dcd7d36fdbd3c194052fab2f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 20:25:59 +0000
Subject: [PATCH 10/18] pack and unpack support

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     | 12 +++++++
 .../XeGPUSgToWiDistributeExperimental.cpp     | 12 +++++++
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 32 ++-----------------
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 25 +++++++++++++++
 4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index d327a431d6ec4..9f391332df37a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -26,6 +26,10 @@ namespace xegpu {
 class DistributeLayoutAttr;
 class LayoutAttr;
 class TensorDescType;
+
+namespace uArch {
+class uArch;
+} // namespace uArch
 } // namespace xegpu
 
 namespace xegpu {
@@ -207,6 +211,14 @@ void recoverTemporaryLayoutsDeprecated(Operation *op);
 /// a layout attribute.
 bool recoverTemporaryLayouts(Operation *rootOp);
 
+/// Helper function to check if the layout is packed. Layout is packed if it is
+/// 2D and lane_data[0] != 1 (data packed from col dimension).
+/// TODO: Move to target info.
+bool requirePacked(const LayoutAttr layout);
+
+/// Helper function to check if the layout requires a transpose effect.
+bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch);
+
 } // namespace xegpu
 
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 8005637da4bf8..c83c9a53c8734 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -120,6 +121,12 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
     if (op.getTensorDescType().getLayout() != layout)
       return rewriter.notifyMatchFailure(
           op, "conflicting layout attributes on tensor descriptor and anchor");
+    auto uArch = getUArch(xegpu::getChipStr(op).value_or(""));
+    if (!uArch)
+      return rewriter.notifyMatchFailure(
+          op, "xegpu::LoadNdOp require target attribute attached to "
+              "determine transpose "
+              "requirement");
     auto supportedWiResultTyOrFailure =
         xegpu::getDistributedVectorType(op.getTensorDescType());
     auto expectedWiResultTyOrFailure =
@@ -136,6 +143,11 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
         adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
         op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
         op.getL3HintAttr(), /**layout**/ nullptr);
+    // Set the packed attribute if the layout requires it.
+    newOp.setPacked(xegpu::requirePacked(cast<xegpu::LayoutAttr>(layout)));
+    // Set the transpose attribute if the layout requires it.
+    if (xegpu::requireTranspose(cast<xegpu::LayoutAttr>(layout), uArch))
+      newOp.setTranspose(DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
     rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
                                        expectedWiResultTyOrFailure.value()));
     return success();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8beadeb5da309..141247931a233 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -103,34 +103,6 @@ static Value resolveDistributedTy(Value orig, T expected,
   return orig;
 }
 
-/// Helper function to check if the layout is packed. Layout is packed if it is
-/// 2D and lane_data[0] != 1 (data packed from col dimension).
-/// TODO: Move to target info.
-static bool requirePacked(const xegpu::LayoutAttr layout) {
-  if (!layout)
-    return false;
-  auto laneData = layout.getEffectiveLaneDataAsInt();
-  if (laneData.size() != 2)
-    return false;
-  return laneData[0] != 1;
-}
-
-/// Helper function to check if the layout requires a transpose effect.
-static bool requireTranspose(const xegpu::LayoutAttr layout,
-                             const xegpu::uArch::uArch *uArch) {
-  // Return false for unsupported targets.
-  // TODO: Add more support or move to target info.
-  if (uArch->getName().equals_insensitive("pvc") &&
-      uArch->getName().equals_insensitive("bmg"))
-    return false;
-  if (!layout)
-    return false;
-  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
-  if (laneLayout.size() != 2)
-    return false;
-  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
-}
-
 /// Given a vector type and its distributed vector type, return the list of
 /// dimensions that are distributed.
 static SmallVector<int64_t> getDistributedDims(VectorType originalType,
@@ -533,9 +505,9 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
         newLoadOperands, loadOp->getAttrs());
     xegpu::removeLayoutAttrs(newLoadOp);
     // Set the packed attribute if the layout requires it.
-    newLoadOp.setPacked(requirePacked(layout));
+    newLoadOp.setPacked(xegpu::requirePacked(layout));
     // Set the transpose attribute if the layout requires it.
-    if (requireTranspose(layout, uArch))
+    if (xegpu::requireTranspose(layout, uArch))
       newLoadOp.setTranspose(
           DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
     Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 1a1f331efe608..062d0a872bc82 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/ValueRange.h"
@@ -762,3 +763,27 @@ template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
 template int
 xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
                                    ArrayRef<unsigned> candidateMultiples);
+
+bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
+  // A null layout is never packed.
+  if (!layout)
+    return false;
+  // Packed means a 2D effective lane_data whose first entry is not 1.
+  const auto effectiveLaneData = layout.getEffectiveLaneDataAsInt();
+  return effectiveLaneData.size() == 2 && effectiveLaneData[0] != 1;
+}
+
+bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
+                             const xegpu::uArch::uArch *uArch) {
+  // Only PVC and BMG are handled; other targets, or a null uArch/layout,
+  // never require a transpose. TODO: Add more support or move to target info.
+  if (!uArch || !layout)
+    return false;
+  if (!uArch->getName().equals_insensitive("pvc") &&
+      !uArch->getName().equals_insensitive("bmg"))
+    return false;
+  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
+  // Required only when the 2D lane layout is [subgroup size, 1].
+  return laneLayout.size() == 2 &&
+         laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
+}

>From 5e9b1d0497cba45bee9486cccaca0f38d374e48a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 22:45:17 +0000
Subject: [PATCH 11/18] save work

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 39 ++++++---
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 87 ++++++++++++++++++-
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  | 45 ++++++++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 14 ++-
 4 files changed, 171 insertions(+), 14 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index c83c9a53c8734..efe8cbe8caedc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -86,8 +86,7 @@ static LogicalResult verifyLayouts(Operation *root) {
   return walkResult.wasInterrupted() ? failure() : success();
 }
 
-struct CreateNdDescOpPattern
-    : public OpConversionPattern<xegpu::CreateNdDescOp> {
+struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
   LogicalResult
@@ -106,7 +105,7 @@ struct CreateNdDescOpPattern
   }
 };
 
-struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
+struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
   using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
 
   LogicalResult
@@ -154,7 +153,7 @@ struct LoadNdOpPattern : public OpConversionPattern<xegpu::LoadNdOp> {
   }
 };
 
-struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
+struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
   using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
 
   LogicalResult
@@ -192,7 +191,7 @@ struct StoreNdOpPattern : public OpConversionPattern<xegpu::StoreNdOp> {
   }
 };
 
-struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
+struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
   using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
 
   LogicalResult
@@ -242,8 +241,8 @@ struct DpasOpPattern : public OpConversionPattern<xegpu::DpasOp> {
   }
 };
 
-struct ElementWiseOpPattern : public ConversionPattern {
-  ElementWiseOpPattern(TypeConverter &typeConverter, MLIRContext *ctx)
+struct WgToWiElementWise : public ConversionPattern {
+  WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
       : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
 
   LogicalResult
@@ -287,7 +286,7 @@ struct ElementWiseOpPattern : public ConversionPattern {
   }
 };
 
-struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
+struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
   using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
 
   LogicalResult
@@ -327,6 +326,26 @@ struct ArithConstantOpPattern : public OpConversionPattern<arith::ConstantOp> {
   }
 };
 
+struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
+  using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
+  // Rewrites a PrefetchNdOp that has an anchor layout into one without it.
+  LogicalResult
+  matchAndRewrite(xegpu::PrefetchNdOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::DistributeLayoutAttr layout = op.getAnchorLayout();
+    // Without an anchor layout there is nothing to rewrite; fail the match.
+    if (!layout)
+      return failure();
+    // Same offsets and cache hints, adaptor's tensor desc, but a null layout.
+    xegpu::PrefetchNdOp::create(rewriter, op.getLoc(), adaptor.getTensorDesc(),
+                                op.getMixedOffsets(), op.getL1HintAttr(),
+                                op.getL2HintAttr(), op.getL3HintAttr(),
+                                /**layout**/ nullptr);
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 struct XeGPUSgToWiDistributeExperimentalPass
     : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
           XeGPUSgToWiDistributeExperimentalPass> {
@@ -521,7 +540,7 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op->getResult(0)));
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-  patterns.add<CreateNdDescOpPattern, LoadNdOpPattern, StoreNdOpPattern,
-               DpasOpPattern, ElementWiseOpPattern, ArithConstantOpPattern>(
+  patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
+               WgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
       typeConverter, patterns.getContext());
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index c89586f450164..715b327ba62a1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -2,7 +2,12 @@
 // RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' \
 // --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
 
+
+
 gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @create_nd_tdesc
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[TD:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
 gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16>
@@ -10,6 +15,9 @@ gpu.func @create_nd_tdesc(%arg0: memref<256x256xf16>) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @cerate_nd_tedesc_nonmemref_source
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[TD:.*]] = xegpu.create_nd_tdesc %{{.*}}, shape : [256, 256], strides : [256, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
 gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0, shape : [256, 256], strides : [256, 1] : ui64
@@ -17,6 +25,10 @@ gpu.func @cerate_nd_tedesc_nonmemref_source(%arg0: ui64) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @load_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
 gpu.func @load_nd() {
   %c0 = arith.constant 0 : index
   %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -25,6 +37,36 @@ gpu.func @load_nd() {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @load_nd_packed
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
+gpu.func @load_nd_packed() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+  %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @load_nd_transpose
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+// CHECK: %[[CAST:.*]] = vector.shape_cast %[[LOAD]] : vector<8xf32> to vector<1x8xf32>
+gpu.func @load_nd_transpose() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+  %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @store_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf16> to vector<16x1xf16>
+// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[CAST2]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[CAST3]], %{{.*}}[%[[C0]], %[[C0]]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
 gpu.func @store_nd() {
   %c0 = arith.constant 0 : index
   %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -36,7 +78,20 @@ gpu.func @store_nd() {
   gpu.return
 }
 
-gpu.func @dpas_op() {
+// CHECK-LABEL: gpu.func @dpas
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
+// CHECK: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
+// CHECK: %[[DPAS:.*]] = xegpu.dpas %[[CAST6]], %[[CAST5]], %[[CAST4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[CAST7:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: gpu.return
+gpu.func @dpas() {
   %c0 = arith.constant 0 : index
   %0 = "some_op"() : () -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   %1 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
@@ -54,7 +109,14 @@ gpu.func @dpas_op() {
   gpu.return
 }
 
-gpu.func @elementwise_op() {
+// CHECK-LABEL: gpu.func @elementwise
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf32> -> vector<16xf32>
+// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD]] : vector<16xf32> to vector<16x1xf32>
+// CHECK: %[[ADD:.*]] = arith.addf %[[CAST2]], %[[CST]] : vector<16x1xf32>
+// CHECK: gpu.return
+gpu.func @elementwise() {
   %c0 = arith.constant 0 : index
   %0 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
     dense<1.0> : vector<16x16xf32>
@@ -66,4 +128,25 @@ gpu.func @elementwise_op() {
     : vector<16x16xf32>
   gpu.return
 }
+
+// CHECK-LABEL: gpu.func @arith_constant
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf32>
+// CHECK: gpu.return
+gpu.func @arith_constant() {
+  %0 = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    dense<1.0> : vector<16x16xf32>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @prefetch_nd
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: xegpu.prefetch_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<16x16xf16>
+// CHECK: gpu.return
+gpu.func @prefetch_nd() {
+  %c0 = arith.constant 0 : index
+  %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.prefetch_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
 }
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
new file mode 100644
index 0000000000000..c54d56128f21a
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -0,0 +1,45 @@
+gpu.module @xevm_module{
+gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c8 = arith.constant 8 : index
+  %c1024 = arith.constant 1024 : index
+  %block_id_x = gpu.block_id  x
+  %block_id_y = gpu.block_id  y
+  %0 = arith.muli %block_id_x, %c8 : index
+  %1 = arith.muli %block_id_y, %c16 : index
+  %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %3 = xegpu.load_nd %2[%0, %1]
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+  %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+    %7 = xegpu.load_nd %5[%0, %arg3]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+    %8 = xegpu.load_nd %6[%arg3, %1]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+    %9 = xegpu.dpas %7, %8, %arg4
+      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+    scf.yield %9 : vector<8x16xf32>
+  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 03a11908499e1..99cc09f30c821 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -254,6 +254,9 @@ struct TestXeGPUSGDistribute
   }
 };
 
+/// This test pass is intended to test the subgroup to workitem distribution of
+/// xegpu/vector/arith operations in isolation. It does not handle any
+/// structural ops such as scf.for.
 struct TestXeGPUSgToWiDistributeExperimental
     : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -285,12 +288,19 @@ struct TestXeGPUSgToWiDistributeExperimental
   void runOnOperation() override {
     MLIRContext *ctx = &getContext();
     TypeConverter typeConverter;
+    // Define type materializations using UnrealizedConversionCastOp.
+    auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type,
+                               mlir::ValueRange inputs,
+                               mlir::Location loc) -> mlir::Value {
+      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
+          .getResult(0);
+    };
+    typeConverter.addSourceMaterialization(materializeCast);
+    typeConverter.addTargetMaterialization(materializeCast);
     ConversionTarget target(*ctx);
     RewritePatternSet patterns(ctx);
     xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         typeConverter, patterns, target);
-    scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
-                                                         patterns, target);
     (void)applyPartialConversion(getOperation(), target, std::move(patterns));
   }
 };

>From 55436910a83aee6fa9a346d89c81b07de29e64d6 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 23:40:55 +0000
Subject: [PATCH 12/18] fix

---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   2 +-
 .../XeGPUSgToWiDistributeExperimental.cpp     |  11 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |   4 +-
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  | 174 ++++++++++++++++++
 4 files changed, 186 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 9f391332df37a..e40d4eb6f8b9a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -28,7 +28,7 @@ class LayoutAttr;
 class TensorDescType;
 
 namespace uArch {
-class uArch;
+struct uArch;
 } // namespace uArch
 } // namespace xegpu
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index efe8cbe8caedc..ba960ee00ed5d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -26,6 +26,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
@@ -53,7 +54,7 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
       v.getType().getNumElements() == expectedTy.getNumElements())
     return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
 
-  // else create an unrealized cast.
+  // Else create an unrealized cast.
   auto newOp = UnrealizedConversionCastOp::create(rewriter, v.getLoc(),
                                                   expectedTy, ValueRange{v});
   return newOp.getResult(0);
@@ -272,7 +273,13 @@ struct WgToWiElementWise : public ConversionPattern {
 
     VectorType newResultType = wiShapeOrFailure.value();
     OperationState state(op->getLoc(), op->getName());
-    state.addOperands(operands);
+    // Cast the types of operands to the expected workitem types.
+    SmallVector<Value> newOperands =
+        llvm::map_to_vector(operands, [&](Value v) {
+          return castValueTo(rewriter, cast<TypedValue<VectorType>>(v),
+                             newResultType);
+        });
+    state.addOperands(newOperands);
     state.addTypes(newResultType);
     // Copy all attributes except for DistributeLayoutAttr.
     for (auto attr : op->getAttrs()) {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 715b327ba62a1..e9d374c8ca2f1 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -1,6 +1,6 @@
 
-// RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' \
-// --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+// RUN: mlir-opt  --xevm-attach-target='module=xevm_* chip=pvc' --allow-unregistered-dialect \
+// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
 
 
 
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index c54d56128f21a..35aa83dfb34af 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -1,3 +1,31 @@
+// RUN: mlir-opt --allow-unregistered-dialect --xevm-attach-target='module=xevm_* chip=pvc' \
+// RUN: --xegpu-sg-to-wi-distribute-experimental --split-input-file %s --canonicalize --cse | FileCheck %s
+
+// CHECK-LABEL: gpu.func @gemm
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK: %[[BID_X:.*]] = gpu.block_id  x
+// CHECK: %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK: %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK: %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK: %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK: %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK: %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK: %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK:   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK:   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK:   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK:   %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK:   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK:   scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[CAST_FOR:.*]] = vector.shape_cast %[[FOR]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: xegpu.store_nd %[[CAST_FOR]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: gpu.return
 gpu.module @xevm_module{
 gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
   %c0 = arith.constant 0 : index
@@ -42,4 +70,150 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
     !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   gpu.return
 }
+
+// CHECK-LABEL: gpu.func @gemm_with_preop
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x1xbf16>
+// CHECK: %[[BID_X:.*]] = gpu.block_id  x
+// CHECK: %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK: %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK: %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK: %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK: %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK: %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK: %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK:   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK:   %[[CAST_A:.*]] = vector.shape_cast %[[LOAD_A]] : vector<8xbf16> to vector<8x1xbf16>
+// CHECK:   %[[PREOP:.*]] = arith.addf %[[CAST_A]], %[[CST]] : vector<8x1xbf16>
+// CHECK:   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK:   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK:   %[[CAST_PREOP:.*]] = vector.shape_cast %[[PREOP]] : vector<8x1xbf16> to vector<8xbf16>
+// CHECK:   %[[DPAS:.*]] = xegpu.dpas %[[CAST_PREOP]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK:   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK:   scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[CAST_FOR:.*]] = vector.shape_cast %[[FOR]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: xegpu.store_nd %[[CAST_FOR]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: gpu.return
+gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c8 = arith.constant 8 : index
+  %c1024 = arith.constant 1024 : index
+  %cst = arith.constant  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.0> : vector<8x16xbf16>
+  %block_id_x = gpu.block_id  x
+  %block_id_y = gpu.block_id  y
+  %0 = arith.muli %block_id_x, %c8 : index
+  %1 = arith.muli %block_id_y, %c16 : index
+  %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %3 = xegpu.load_nd %2[%0, %1]
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+  %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+    %7 = xegpu.load_nd %5[%0, %arg3]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+    %preop = arith.addf %7, %cst {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xbf16>
+    %8 = xegpu.load_nd %6[%arg3, %1]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+    %9 = xegpu.dpas %preop, %8, %arg4
+      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+    scf.yield %9 : vector<8x16xf32>
+  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @gemm_with_postop
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index
+// CHECK: %[[BID_X:.*]] = gpu.block_id  x
+// CHECK: %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK: %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK: %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK: %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK: %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK: %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK: %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
+// CHECK:   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK:   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK:   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK:   %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK:   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
+// CHECK:   scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
+// CHECK: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK: %[[POSTOP:.*]] = math.exp %[[FOR]] : vector<8x1xf32>
+// CHECK: %[[CAST_POSTOP:.*]] = vector.shape_cast %[[POSTOP]] : vector<8x1xf32> to vector<8xf32>
+// CHECK: xegpu.store_nd %[[CAST_POSTOP]], %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.func @gemm_with_postop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c8 = arith.constant 8 : index
+  %c1024 = arith.constant 1024 : index
+  %block_id_x = gpu.block_id  x
+  %block_id_y = gpu.block_id  y
+  %0 = arith.muli %block_id_x, %c8 : index
+  %1 = arith.muli %block_id_y, %c16 : index
+  %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %3 = xegpu.load_nd %2[%0, %1]
+    {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+    layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
+      -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+
+  %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
+    %7 = xegpu.load_nd %5[%0, %arg3]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
+    %8 = xegpu.load_nd %6[%arg3, %1]
+      {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+      : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
+
+    %9 = xegpu.dpas %7, %8, %arg4
+      {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+       layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+       layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+      : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
+
+    scf.yield %9 : vector<8x16xf32>
+  } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+  %postop = math.exp %4 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>
+  xegpu.store_nd %postop, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}: vector<8x16xf32>,
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
 }

>From a2ce36b4736bbf43bf608ad9664982cfa5a5e14c Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 22 Jan 2026 23:53:12 +0000
Subject: [PATCH 13/18] fix

---
 .../XeGPU/sg-to-wi-experimental-unit.mlir        | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index e9d374c8ca2f1..0e9843f4626d4 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -80,14 +80,14 @@ gpu.func @store_nd() {
 
 // CHECK-LABEL: gpu.func @dpas
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
-// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
-// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
-// CHECK: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
-// CHECK: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
-// CHECK: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32>
+// CHECK-DAG: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[CAST2:.*]] = vector.shape_cast %[[LOAD0]] : vector<8xf16> to vector<8x1xf16>
+// CHECK-DAG: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}}[%[[C0]], %[[C0]]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[CAST3:.*]] = vector.shape_cast %[[LOAD1]] : vector<16xf16> to vector<16x1xf16>
+// CHECK-DAG: %[[CAST4:.*]] = vector.shape_cast %[[CST]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[CAST5:.*]] = vector.shape_cast %[[CAST3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[CAST6:.*]] = vector.shape_cast %[[CAST2]] : vector<8x1xf16> to vector<8xf16>
 // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[CAST6]], %[[CAST5]], %[[CAST4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
 // CHECK: %[[CAST7:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
 // CHECK: gpu.return

>From 33b11c86b3abeedbd43da4e7e97e21ec381e861a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 23 Jan 2026 22:53:55 +0000
Subject: [PATCH 14/18] add comments

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index ba960ee00ed5d..88079b3edddf6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -46,10 +46,13 @@ using namespace mlir;
 
 namespace {
 
+/// Casts the given vector value `v` to the expected vector type `expectedTy`.
 static Value castValueTo(ConversionPatternRewriter &rewriter,
                          TypedValue<VectorType> v, VectorType expectedTy) {
+  // If the type matches, simply return the value itself.
   if (v.getType() == expectedTy)
     return v;
+  // If only shape differs, use shape cast.
   if (isa<VectorType>(v.getType()) &&
       v.getType().getNumElements() == expectedTy.getNumElements())
     return vector::ShapeCastOp::create(rewriter, v.getLoc(), expectedTy, v);
@@ -60,6 +63,8 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
   return newOp.getResult(0);
 }
 
+/// Checks if all XeGPU anchor ops and vector results have valid layouts.
+/// TODO: This function can be removed once the full layout refactoring is done.
 static LogicalResult verifyLayouts(Operation *root) {
   auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
     if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
@@ -87,6 +92,8 @@ static LogicalResult verifyLayouts(Operation *root) {
   return walkResult.wasInterrupted() ? failure() : success();
 }
 
+/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
+/// op. This simply drops the layout attribute from the tensor descriptor type.
 struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
@@ -106,6 +113,9 @@ struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
   }
 };
 
+/// Distributes a subgroup-level LoadNd op to workitem-level LoadNd op. Output
+/// of workitem-level LoadNd op is 1D. ShapeCast is added to restore the
+/// original rank.
 struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
   using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
 
@@ -154,6 +164,9 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
   }
 };
 
+/// Distributes a subgroup-level StoreNd op to workitem-level StoreNd op. Stored
+/// value in workitem-level StoreNd op is 1D. ShapeCast is added to cast the
+/// incoming value to 1D.
 struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
   using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
 
@@ -192,6 +205,9 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
   }
 };
 
+/// Distributes a subgroup-level Dpas op to workitem-level Dpas op. All inputs
+/// and output of workitem-level Dpas op are 1D. Necessary casts are added to
+/// convert the inputs and output to/from 1D.
 struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
   using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
 
@@ -223,8 +239,6 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
       return rewriter.notifyMatchFailure(
           op, "unable to compute expected workitem vector type for DpasOp from "
               "lane layout");
-    // llvm::errs() << "adaptor acc type: " << adaptor.getAcc().getType() <<
-    // "\n"; llvm::errs() << "ops acc type: " << op.getAcc().getType() << "\n";
     auto newOp = xegpu::DpasOp::create(
         rewriter, op->getLoc(), wiResultTyOrFailure.value(),
         castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
@@ -242,6 +256,8 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
   }
 };
 
+/// Distributes elementwise ops to workitem-level elementwise ops. This
+/// currently handles elementwise ops with single result only.
 struct WgToWiElementWise : public ConversionPattern {
   WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
       : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
@@ -293,6 +309,8 @@ struct WgToWiElementWise : public ConversionPattern {
   }
 };
 
+/// Distributes a subgroup-level arith ConstantOp to workitem-level arith
+/// ConstantOp.
 struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
   using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
 
@@ -333,6 +351,7 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
   }
 };
 
+/// Distributes a subgroup-level PrefetchNd op to workitem-level PrefetchNd op.
 struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
   using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
 

>From 1f2eee1d5be8be4b4895307684a6301aa2c3cc62 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 23 Jan 2026 22:59:57 +0000
Subject: [PATCH 15/18] add comments

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 88079b3edddf6..4e38c7de6bd8e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -64,7 +64,6 @@ static Value castValueTo(ConversionPatternRewriter &rewriter,
 }
 
 /// Checks if all XeGPU anchor ops and vector results have valid layouts.
-/// TODO: This function can be removed once the full layout refactoring is done.
 static LogicalResult verifyLayouts(Operation *root) {
   auto walkResult = root->walk([&](Operation *nestedOp) -> WalkResult {
     if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(nestedOp)) {
@@ -383,6 +382,7 @@ struct XeGPUSgToWiDistributeExperimentalPass
 void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
   // Verify if all XeGPU anchor ops and vector ops have result layouts.
+  // TODO: This can be removed once the full layout refactoring is done.
   Operation *root = getOperation();
   if (failed(verifyLayouts(root))) {
     LLVM_DEBUG(DBGS() << "XeGPUSgToWiDistributeExperimentalPass: layout "
@@ -432,6 +432,11 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
   // UnrealizedConversionCastOps to materialize the SG type from type converted
   // WI type. These are redundant at this point and can be eliminated by
   // inserting shape casts instead.
+  // Example:
+  // %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32> to vector<16x16xf32>
+  // %2 = UnrealizedConversionCastOp %1 : vector<16x16xf32> to vector<16xf32>
+  // This can be replaced with:
+  // %2 = vector.shape_cast %0 : vector<16x1xf32> to vector<16xf32>
   OpBuilder builder(root);
   root->walk([&](UnrealizedConversionCastOp op) {
     // If this op existed before, nothing to do.
@@ -485,22 +490,21 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
 
 void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
     TypeConverter &typeConverter) {
-  // Populate type conversions.
-  // - Any type other than TensorDescType and VectorType are legal as is.
+  // Any type other than TensorDescType and VectorType is legal as is.
   typeConverter.addConversion([](Type type) -> std::optional<Type> {
     if (!isa<TensorDescType, VectorType>(type))
       return type;
     return std::nullopt;
   });
-  // - For TensorDescType, drop the layout attribute if any.
+  // For TensorDescType, drop the layout attribute if any.
   typeConverter.addConversion([](TensorDescType type) -> Type {
     if (type.getLayoutAttr()) {
       return type.dropLayouts();
     }
     return type;
   });
-  // - For VectorType, check if there is a distribute layout attribute on the
-  //   value. If so, convert to the distributed vector type based on the layout.
+  // For VectorType, check if there is a distribute layout attribute on the
+  // value. If so, convert to the distributed vector type based on the layout.
   typeConverter.addConversion([](Value v) -> std::optional<Type> {
     auto type = v.getType();
     // If value is not vector type, nothing to do.
@@ -522,26 +526,26 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target) {
   populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
-  // - CreateNdDescOp is legal only if its result type has no layout attribute.
+  // CreateNdDescOp is legal only if its result type has no layout attribute.
   target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
       [&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
-  // - Any anchor XeGPU op is legal only if it has no anchor layout.
+  // Any anchor XeGPU op is legal only if it has no anchor layout.
   target.addDynamicallyLegalDialect<xegpu::XeGPUDialect>([](Operation *op) {
     auto anchorOp = dyn_cast<AnchorLayoutInterface>(op);
     if (!anchorOp)
       return true;
     return !anchorOp.getAnchorLayout();
   });
+  // Arith constants are legal only if they have no temporary layout attribute.
   target.addDynamicallyLegalOp<arith::ConstantOp>(
       [=](arith::ConstantOp op) -> bool {
         // If the result type is not a vector, it's legal.
         if (!isa<VectorType>(op.getResult().getType()))
           return true;
-        // For vector result types, check if it has a layout attribute.
         return !xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
       });
-  // - In math and arith dialects, only handle elementwise ops with a single
-  //   result and with a result layout attribute.
+  // In math and arith dialects, only handle elementwise ops with a single
+  // result and with a result layout attribute.
   target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
       [=](Operation *op) -> std::optional<bool> {
         // Only handle elementwise mappable ops

>From 678162c152d790d1d7930b4c671425f25adda842 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 24 Jan 2026 00:06:58 +0000
Subject: [PATCH 16/18] add comments

---
 .../Dialect/XeGPU/sg-to-wi-experimental.mlir  | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
index 35aa83dfb34af..9172cd3018b71 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
@@ -6,19 +6,19 @@
 // CHECK-DAG  : %[[C16:.*]] = arith.constant 16 : index
 // CHECK-DAG  : %[[C8:.*]] = arith.constant 8 : index
 // CHECK-DAG  : %[[C1024:.*]] = arith.constant 1024 : index
-// CHECK      : %[[BID_X:.*]] = gpu.block_id  x
-// CHECK      : %[[BID_Y:.*]] = gpu.block_id  y
-// CHECK      : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK      : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG  : %[[BID_X:.*]] = gpu.block_id  x
+// CHECK-DAG  : %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK-DAG  : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG  : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
 // CHECK      : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK      : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK      : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK      : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK      : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG  : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG  : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG  : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
 // CHECK      : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK      :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK      :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK      :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG  :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG  :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG  :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
 // CHECK      :   %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
 // CHECK      :   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
 // CHECK      :   scf.yield %[[CAST_DPAS]] : vector<8x1xf32>
@@ -77,21 +77,21 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
 // CHECK-DAG  : %[[C8:.*]] = arith.constant 8 : index
 // CHECK-DAG  : %[[C1024:.*]] = arith.constant 1024 : index
 // CHECK      : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x1xbf16>
-// CHECK      : %[[BID_X:.*]] = gpu.block_id  x
-// CHECK      : %[[BID_Y:.*]] = gpu.block_id  y
-// CHECK      : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK      : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG  : %[[BID_X:.*]] = gpu.block_id  x
+// CHECK-DAG  : %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK-DAG  : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG  : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
 // CHECK      : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK      : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK      : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK      : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK      : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG  : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG  : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG  : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
 // CHECK      : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK      :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG  :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
 // CHECK      :   %[[CAST_A:.*]] = vector.shape_cast %[[LOAD_A]] : vector<8xbf16> to vector<8x1xbf16>
 // CHECK      :   %[[PREOP:.*]] = arith.addf %[[CAST_A]], %[[CST]] : vector<8x1xbf16>
-// CHECK      :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK      :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG  :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG  :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
 // CHECK      :   %[[CAST_PREOP:.*]] = vector.shape_cast %[[PREOP]] : vector<8x1xbf16> to vector<8xbf16>
 // CHECK      :   %[[DPAS:.*]] = xegpu.dpas %[[CAST_PREOP]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
 // CHECK      :   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
@@ -151,19 +151,19 @@ gpu.func @gemm_with_preop(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024
 // CHECK-DAG  : %[[C16:.*]] = arith.constant 16 : index
 // CHECK-DAG  : %[[C8:.*]] = arith.constant 8 : index
 // CHECK-DAG  : %[[C1024:.*]] = arith.constant 1024 : index
-// CHECK      : %[[BID_X:.*]] = gpu.block_id  x
-// CHECK      : %[[BID_Y:.*]] = gpu.block_id  y
-// CHECK      : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
-// CHECK      : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
+// CHECK-DAG  : %[[BID_X:.*]] = gpu.block_id  x
+// CHECK-DAG  : %[[BID_Y:.*]] = gpu.block_id  y
+// CHECK-DAG  : %[[MUL_X:.*]] = arith.muli %[[BID_X]], %[[C8]] : index
+// CHECK-DAG  : %[[MUL_Y:.*]] = arith.muli %[[BID_Y]], %[[C16]] : index
 // CHECK      : %[[TD_C:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK      : %[[LOAD_C:.*]] = xegpu.load_nd %[[TD_C]][%[[MUL_X]], %[[MUL_Y]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK      : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
-// CHECK      : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK      : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG  : %[[CAST_C:.*]] = vector.shape_cast %[[LOAD_C]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-DAG  : %[[TD_A:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG  : %[[TD_B:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
 // CHECK      : %[[FOR:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C1024]] step %[[C16]] iter_args(%[[ACC:.*]] = %[[CAST_C]]) -> (vector<8x1xf32>) {
-// CHECK      :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK      :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK      :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG  :   %[[LOAD_A:.*]] = xegpu.load_nd %[[TD_A]][%[[MUL_X]], %[[IV]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG  :   %[[LOAD_B:.*]] = xegpu.load_nd %[[TD_B]][%[[IV]], %[[MUL_Y]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG  :   %[[CAST_ACC:.*]] = vector.shape_cast %[[ACC]] : vector<8x1xf32> to vector<8xf32>
 // CHECK      :   %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]], %[[CAST_ACC]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
 // CHECK      :   %[[CAST_DPAS:.*]] = vector.shape_cast %[[DPAS]] : vector<8xf32> to vector<8x1xf32>
 // CHECK      :   scf.yield %[[CAST_DPAS]] : vector<8x1xf32>

>From 5a2bf18a0fa4b1990105cb6a072a15cb99a92145 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 27 Jan 2026 19:42:57 +0000
Subject: [PATCH 17/18] fix

---
 .../XeGPUSgToWiDistributeExperimental.cpp     | 21 +++----------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 4e38c7de6bd8e..67f8cf633b849 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -288,13 +288,7 @@ struct WgToWiElementWise : public ConversionPattern {
 
     VectorType newResultType = wiShapeOrFailure.value();
     OperationState state(op->getLoc(), op->getName());
-    // Cast the types of operands to the expected workitem types.
-    SmallVector<Value> newOperands =
-        llvm::map_to_vector(operands, [&](Value v) {
-          return castValueTo(rewriter, cast<TypedValue<VectorType>>(v),
-                             newResultType);
-        });
-    state.addOperands(newOperands);
+    state.addOperands(operands);
     state.addTypes(newResultType);
     // Copy all attributes except for DistributeLayoutAttr.
     for (auto attr : op->getAttrs()) {
@@ -413,16 +407,6 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
     xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
     scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
                                                          patterns, target);
-    target.addLegalOp<UnrealizedConversionCastOp>();
-    (void)applyPartialConversion(root, target, std::move(patterns));
-  }
-  // Apply the XeGPU subgroup to workitem distribution patterns.
-  {
-    ConversionTarget target(getContext());
-    TypeConverter typeConverter;
-    typeConverter.addTargetMaterialization(materializeCast);
-    typeConverter.addSourceMaterialization(materializeCast);
-    RewritePatternSet patterns(&getContext());
     xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
         typeConverter, patterns, target);
     target.addLegalOp<UnrealizedConversionCastOp>();
@@ -453,7 +437,8 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
       return;
 
     // Check if the defining op of the input is also an
-    // UnrealizedConversionCastOp and it has a single user (which is this op).
+    // UnrealizedConversionCastOp and it has a single user (which is this
+    // op).
     auto definingOp = singleInput.getDefiningOp<UnrealizedConversionCastOp>();
     if (!definingOp || !definingOp->hasOneUse())
       return;

>From d548f9507674c4156003a083fdfedc3406c386dc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 28 Jan 2026 21:52:11 +0000
Subject: [PATCH 18/18] address issues

---
 mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h          | 2 --
 .../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp  | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index e40d4eb6f8b9a..700db5f9dd9be 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -71,8 +71,6 @@ FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
 /// according to the lane_layout. We simply divide each dimension of tensor
 /// descriptor shape by corresponding lane_layout dimension. If
 /// array_length > 1, that is appended to the front of the distributed shape.
-/// NOTE: This is the vector type that will be returned by the
-/// gpu.warp_execute_on_lane0 op.
 ///
 /// Examples:
 /// | original vector shape | lane_layout | distributed vector shape |
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 67f8cf633b849..4ae858363d5b6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -257,8 +257,8 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
 
 /// Distributes elementwise ops to workitem-level elementwise ops. This
 /// currently handles elementwise ops with single result only.
-struct WgToWiElementWise : public ConversionPattern {
-  WgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
+struct SgToWiElementWise : public ConversionPattern {
+  SgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
       : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
 
   LogicalResult
@@ -556,6 +556,6 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
       });
   target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
   patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
-               WgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
+               SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd>(
       typeConverter, patterns.getContext());
 }