[Mlir-commits] [mlir] [MLIR][XeGPU] Add blocking pass [1/N] (PR #140163)
Chao Chen
llvmlistbot at llvm.org
Fri May 16 08:06:48 PDT 2025
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/140163
From 777a403f896d811dbe36a7aed6ccacf6adf9c833 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 12 May 2025 19:36:58 +0000
Subject: [PATCH 01/11] add utils
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 15 +++++++
.../Transforms/XeGPUSubgroupDistribute.cpp | 27 +++++--------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 40 +++++++++++++++++++
3 files changed, 64 insertions(+), 18 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 3616fa614e7f9..5c2a308887040 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -13,6 +13,9 @@
namespace mlir {
class VectorType;
+class OpOperand;
+class OpResult;
+
namespace xegpu {
class LayoutAttr;
class TensorDescType;
@@ -50,6 +53,18 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
LayoutAttr layout);
+/// Retrieves the LayoutAttr associated with a given Value. For TensorDescType
+/// values, the LayoutAttr is extracted from the TensorDescType itself. For
+/// other values, it is obtained from the attributes of the defining operation.
+/// Returns nullptr if no LayoutAttr is found.
+LayoutAttr getLayoutAttr(Value value);
+
+/// Retrieves the name for the LayoutAttr associated with a given OpOperand.
+std::string getLayoutName(OpOperand &opr);
+
+/// Retrieves the name for the LayoutAttr associated with a given OpResult.
+std::string getLayoutName(OpResult res);
+
} // namespace xegpu
} // namespace mlir
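A minimal usage sketch of the new helpers (hypothetical caller; `op` is
assumed to be an Operation* with at least one operand and one result):

    // Look up the layout recorded for the first operand under its
    // generated attribute name, e.g. "layout_operand_0".
    OpOperand &opr = op->getOpOperand(0);
    auto layout =
        op->getAttrOfType<xegpu::LayoutAttr>(xegpu::getLayoutName(opr));
    // Or resolve a layout directly from a value: tensor_desc values answer
    // from their type, other values from their defining op's attributes.
    xegpu::LayoutAttr resLayout = xegpu::getLayoutAttr(op->getOpResult(0));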
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 2300d9e3bd43f..ca887bd0fb7b5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -62,8 +62,6 @@ constexpr unsigned packedSizeInBitsForDefault =
16; // Minimum packing size per register for DPAS A.
constexpr unsigned packedSizeInBitsForDpasB =
32; // Minimum packing size per register for DPAS B.
-static const char *const operandLayoutNamePrefix = "layout_operand_";
-static const char *const resultLayoutNamePrefix = "layout_result_";
namespace {
@@ -728,10 +726,7 @@ class LayoutAttrAssignment {
void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
- unsigned operandNumber = user.getOperandNumber();
- // Use a generic name for ease of querying the layout attribute later.
- std::string attrName =
- operandLayoutNamePrefix + std::to_string(operandNumber);
+ std::string attrName = xegpu::getLayoutName(user);
owner->setAttr(attrName, layout);
}
}
@@ -805,10 +800,10 @@ LogicalResult LayoutAttrAssignment::assign(Operation *op) {
return success();
}
// Otherwise simply attach the layout to the op itself.
- for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ for (auto r : op->getOpResults()) {
xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
if (layoutInfo) {
- std::string attrName = resultLayoutNamePrefix + std::to_string(i);
+ std::string attrName = xegpu::getLayoutName(r);
op->setAttr(attrName, layoutInfo);
// Attach the layout attribute to the users of the result.
assignToUsers(r, layoutInfo);
@@ -928,11 +923,8 @@ static SmallVector<NamedAttribute>
removeTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
SmallVector<NamedAttribute> newAttrs;
for (NamedAttribute attr : attrs) {
- if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
- attr.getName().strref().contains(resultLayoutNamePrefix)) {
- continue;
- }
- newAttrs.push_back(attr);
+ if (!isa<xegpu::LayoutAttr>(attr.getValue()))
+ newAttrs.push_back(attr);
}
return newAttrs;
}
@@ -1335,11 +1327,10 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- std::string layoutAName =
- llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str();
- std::string layoutBName =
- llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str();
- auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
+ std::string layoutAName = xegpu::getLayoutName(dpasOp->getOpOperand(0));
+ std::string layoutBName = xegpu::getLayoutName(dpasOp->getOpOperand(1));
+ std::string layoutCName = xegpu::getLayoutName(dpasOp->getOpResult(0));
+
xegpu::LayoutAttr layoutA =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
xegpu::LayoutAttr layoutB =
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 6b45ed0ae4ced..d101ce07043ec 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -12,6 +12,8 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
@@ -83,3 +85,41 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
/*memory_space=*/xegpu::MemorySpace::Global, layout);
return xegpu::getDistributedVectorType(helperTdescTy);
}
+
+xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
+ if (!value)
+ return LayoutAttr();
+
+ if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(value.getType()))
+ return tdescTy.getLayoutAttr();
+
+ if (auto result = dyn_cast<OpResult>(value)) {
+ Operation *defOp = result.getDefiningOp();
+ assert(defOp && "result must have a defining op");
+ std::string layoutName = getLayoutName(result);
+ if (defOp->hasAttr(layoutName))
+ return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
+ }
+
+ if (auto arg = dyn_cast<BlockArgument>(value)) {
+ auto parentOp = arg.getOwner()->getParentOp();
+ if (auto funcOp = dyn_cast<FuncOp>(parentOp)) {
+ std::string layoutName = getLayoutName(arg);
+ if (funcOp->hasAttr(layoutName))
+ return funcOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
+ }
+ }
+
+ return nullptr;
+}
+
+std::string xegpu::getLayoutName(OpOperand &opr) {
+ const StringRef prefix("layout_operand_");
+ return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str();
+}
+
+std::string xegpu::getLayoutName(OpResult res) {
+ const StringRef prefix = "layout_result_";
+ return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str();
+}
+
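For reference, a quick sketch of what the generated names evaluate to
(assuming a hypothetical op with two operands and one result):

    // xegpu::getLayoutName(op->getOpOperand(0))  -> "layout_operand_0"
    // xegpu::getLayoutName(op->getOpOperand(1))  -> "layout_operand_1"
    // xegpu::getLayoutName(op->getOpResult(0))   -> "layout_result_0"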
From af01c99481e1a88fef78b2517cf9b2f531acbd9f Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 12 May 2025 19:37:07 +0000
Subject: [PATCH 02/11] add skeleton
---
mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td | 12 ++++++++++++
mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt | 1 +
2 files changed, 13 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3e81f2d0ed786..54782933fe5f8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -38,4 +38,16 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
];
}
+def XeGPUInstructionlize: Pass<"xegpu-instructionlize"> {
+ let summary = "Instructionlize XeGPU ops";
+ let description = [{
+ The pass unrolls XeGPU ops working on large shapes into ops working on small shapes
+ (given by the inst_data in the layout attr), such that each of them can be dispatched
+ into a hardware instruction.
+ }];
+ let dependentDialects = [
+ "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
+ ];
+}
+
#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
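As an illustration of the intent (hypothetical shapes, not taken from a test):

    // With a layout carrying inst_data = [8, 16], an op producing
    // vector<32x32xf16> is unrolled into (32/8) * (32/16) = 8 ops,
    // each producing vector<8x16xf16>.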
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 892eb791c46e7..1d94b4c4c03ac 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUFoldAliasOps.cpp
+ XeGPUInstructionlize.cpp
XeGPUSubgroupDistribute.cpp
XeGPUUnroll.cpp
From e8b43fbfe2b3764dc804b13975154b0f584c7d9b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 13 May 2025 00:44:02 +0000
Subject: [PATCH 03/11] add filter
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 4 ++++
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 ++++++++++------
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 032ce5bc18334..3f5fe2cce4636 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -295,11 +295,15 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
}
LayoutAttr dropSgLayoutAndData() {
+ if (!getInstData() && !getLaneLayout())
+ return nullptr;
return LayoutAttr::get(getContext(), nullptr, nullptr, getInstData(),
getLaneLayout(), getLaneData(), getOrder());
}
LayoutAttr dropInstData() {
+ if (!getSgLayout() && !getLaneLayout())
+ return nullptr;
return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr,
getLaneLayout(), getLaneData(), getOrder());
}
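A sketch of the new guards (hypothetical attributes):

    // #xegpu.layout<sg_layout = [4, 2], sg_data = [16, 32]>
    //   .dropSgLayoutAndData() -> nullptr (no inst_data or lane fields remain)
    // #xegpu.layout<sg_layout = [4, 2], inst_data = [8, 16]>
    //   .dropSgLayoutAndData() -> #xegpu.layout<inst_data = [8, 16]>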
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index d101ce07043ec..285a15062e402 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Operation.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
@@ -88,7 +89,7 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
if (!value)
- return LayoutAttr();
+ return nullptr;
if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(value.getType()))
return tdescTy.getLayoutAttr();
@@ -96,6 +97,11 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
+
+ // For LoadNdOp, the layout is stored in the tensor descriptor.
+ if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
+ return getLayoutAttr(loadNd.getTensorDesc());
+
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -103,10 +109,9 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto parentOp = arg.getOwner()->getParentOp();
- if (auto funcOp = dyn_cast<FuncOp>(parentOp)) {
- std::string layoutName = getLayoutName(arg);
- if (funcOp->hasAttr(layoutName))
- return funcOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
+ if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
+ OpOperand *tiedInit = loop.getTiedLoopInit(arg);
+ return getLayoutAttr(tiedInit->get());
}
}
@@ -122,4 +127,3 @@ std::string xegpu::getLayoutName(OpResult res) {
const StringRef prefix = "layout_result_";
return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str();
}
-
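A sketch of the effect on loop-carried values (assuming an scf.for with an
iter_arg %arg tied to an init value %init):

    // %arg carries no attribute of its own, so getLayoutAttr(%arg) now
    // forwards to getLayoutAttr(%init) via LoopLikeOpInterface's
    // getTiedLoopInit, letting layouts propagate into loop bodies.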
From 3f73fda71e833ef844eec19bd2eda0f3b6b31020 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 13 May 2025 01:06:29 +0000
Subject: [PATCH 04/11] clean up
---
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 143 ++++++++++++++++++
1 file changed, 143 insertions(+)
create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
new file mode 100644
index 0000000000000..b83ce86a357f0
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -0,0 +1,143 @@
+//===---- XeGPUInstructionlize.cpp -- XeGPU Instructionlize Pass ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUINSTRUCTIONLIZE
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-instructionlize"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+
+/// Unroll XeGPU ops to their instruction-level representation.
+class XeGPUInstructionlizePass final
+ : public xegpu::impl::XeGPUInstructionlizeBase<XeGPUInstructionlizePass> {
+public:
+ void runOnOperation() override;
+
+private:
+ SmallVector<int64_t> getTileShape(TypedValue<ShapedType> value) const;
+ std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;
+ bool needsUnroll(Operation *op) const;
+};
+} // namespace
+
+SmallVector<int64_t>
+XeGPUInstructionlizePass::getTileShape(TypedValue<ShapedType> value) const {
+ assert(value && "value must be non-null");
+ xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value);
+ if (layout && layout.isSgLayout()) {
+ if (auto inst_data = layout.getInstData())
+ return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());
+ }
+ return llvm::to_vector(value.getType().getShape());
+}
+
+std::optional<SmallVector<int64_t>>
+XeGPUInstructionlizePass::getTileShape(Operation *op) const {
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
+ return getTileShape(cast<TypedValue<ShapedType>>(op->getResult(0)));
+ if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp>(op))
+ return getTileShape(cast<TypedValue<ShapedType>>(op->getOperand(0)));
+ if (isa<xegpu::StoreNdOp>(op))
+ return getTileShape(cast<TypedValue<ShapedType>>(op->getOperand(1)));
+
+ if (isa<xegpu::DpasOp>(op)) {
+ auto a = cast<TypedValue<ShapedType>>(op->getOperand(0));
+ auto b = cast<TypedValue<ShapedType>>(op->getOperand(1));
+ SmallVector<int64_t> aTileShape = getTileShape(a);
+ SmallVector<int64_t> bTileShape = getTileShape(b);
+
+ if (aTileShape.size() != 2 || bTileShape.size() != 2)
+ return std::nullopt;
+
+ // semantic check for A and B
+ if (aTileShape[1] != bTileShape[0])
+ return std::nullopt;
+
+ // semantic check for C
+ if (op->getNumOperands() == 3) {
+ auto c = cast<TypedValue<ShapedType>>(op->getOperand(2));
+ SmallVector<int64_t> cTileShape = getTileShape(c);
+ int64_t expectedShape[2] = {aTileShape[0], bTileShape[1]};
+ if (!llvm::equal(cTileShape, expectedShape))
+ return std::nullopt;
+ }
+
+ return SmallVector<int64_t>({aTileShape[0], aTileShape[1], bTileShape[1]});
+ }
+ return std::nullopt;
+}
+
+bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
+ for (Value opr : op->getOperands()) {
+ if (auto value = dyn_cast<TypedValue<ShapedType>>(opr)) {
+ auto tileShape = getTileShape(value);
+ // the tile should have the same rank as the original type
+ if (tileShape.size() != static_cast<size_t>(value.getType().getRank()))
+ return false;
+ if (!llvm::equal(tileShape, value.getType().getShape()))
+ return true;
+ }
+ }
+ return false;
+}
+
+void XeGPUInstructionlizePass::runOnOperation() {
+ MLIRContext *ctx = &getContext();
+ xegpu::UnrollOptions options;
+ options.setFilterConstraint([&](Operation *op) -> LogicalResult {
+ return needsUnroll(op) ? success() : failure();
+ });
+
+ options.setNativeShapeFn(
+ [&](Operation *op) -> std::optional<SmallVector<int64_t>> {
+ return getTileShape(op);
+ });
+
+ options.setUnrolledTypesFn(
+ [&](ShapedType type, ArrayRef<int64_t> tileShape) -> SmallVector<Type> {
+ Type elemTy = type.getElementType();
+ Type newTy;
+
+ if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type))
+ newTy = xegpu::TensorDescType::get(
+ ctx, tileShape, elemTy, tdescTy.getEncoding(),
+ tdescTy.getLayoutAttr().dropInstData());
+ else
+ newTy = type.clone(tileShape, elemTy);
+
+ std::optional<SmallVector<int64_t>> ratio =
+ computeShapeRatio(type.getShape(), tileShape);
+ assert(ratio &&
+ "The shape of the type must be a multiple of tileShape.");
+ return SmallVector<Type>(computeProduct(*ratio), newTy);
+ });
+
+ RewritePatternSet patterns(ctx);
+
+ populateXeGPUUnrollPatterns(patterns, options);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+}
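A worked example of the DpasOp case (hypothetical inst_data values):

    // A: vector<32x32xf16> with inst_data = [8, 16]  -> aTileShape = {8, 16}
    // B: vector<32x32xf16> with inst_data = [16, 16] -> bTileShape = {16, 16}
    // The K-dim check passes (aTileShape[1] == bTileShape[0]), so the native
    // shape is {8, 16, 16}, i.e. {m, k, n} for a single DPAS instruction.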
From ab448a34294bf2333af8ed52e6d4db540706d20f Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 13 May 2025 18:45:16 +0000
Subject: [PATCH 05/11] add scf type conversion util
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 5 +
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 41 ++--
mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt | 1 +
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 182 ++++++++++++++++++
4 files changed, 215 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 5c2a308887040..4bcda3e3ac95f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -65,6 +65,11 @@ std::string getLayoutName(OpOperand &opr);
/// Retrieves the name for the LayoutAttr associated with a given OpResult.
std::string getLayoutName(OpResult res);
+/// Do type conversion for SCF structural ops, e.g., scf.for. Since VectorType
+/// cannot carry the layout attribute, vector values are first converted to
+/// RankedTensorType, and then converted back to VectorType in a second round.
+void doSCFStructuralTypeConversionWithTensorType(Operation *op);
+
} // namespace xegpu
} // namespace mlir
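The round trip, sketched on a hypothetical loop-carried value:

    // Round 1: vector<32x32xf16> (cannot carry a layout) is rewritten to
    //   tensor<32x32xf16>; a follow-up walk then attaches the layout as the
    //   tensor encoding, e.g. tensor<32x32xf16, #xegpu.layout<...>>.
    // Round 2: the encoded tensor type is converted back into one or more
    //   vector values whose shapes are derived from the layout, with
    //   unrealized_conversion_cast ops bridging the 1:N mapping until they
    //   are resolved.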
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index b83ce86a357f0..efc44aadb14e6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -38,21 +38,33 @@ class XeGPUInstructionlizePass final
void runOnOperation() override;
private:
- SmallVector<int64_t> getTileShape(TypedValue<ShapedType> value) const;
+ // Get the tile shape for a given value. If the value has a layout
+ // attribute and it is an SG layout, return the inst_data as the tile shape
+ // if inst_data is available; otherwise, return the original shape of the
+ // value. If the value does not have an SG layout, return std::nullopt.
+ std::optional<SmallVector<int64_t>>
+ getTileShape(TypedValue<ShapedType> value) const;
+
+ // Get the tile shape for a given operation.
std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;
+
+ // Determine if the operation requires unrolling. Return false if all operands
+ // and results have tile shapes identical to their original types. Otherwise,
+ // return true.
bool needsUnroll(Operation *op) const;
};
} // namespace
-SmallVector<int64_t>
+std::optional<SmallVector<int64_t>>
XeGPUInstructionlizePass::getTileShape(TypedValue<ShapedType> value) const {
assert(value && "value must be non-null");
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value);
if (layout && layout.isSgLayout()) {
if (auto inst_data = layout.getInstData())
return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());
+ return llvm::to_vector(value.getType().getShape());
}
- return llvm::to_vector(value.getType().getShape());
+ return std::nullopt;
}
std::optional<SmallVector<int64_t>>
@@ -67,26 +79,26 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
if (isa<xegpu::DpasOp>(op)) {
auto a = cast<TypedValue<ShapedType>>(op->getOperand(0));
auto b = cast<TypedValue<ShapedType>>(op->getOperand(1));
- SmallVector<int64_t> aTileShape = getTileShape(a);
- SmallVector<int64_t> bTileShape = getTileShape(b);
+ std::optional<SmallVector<int64_t>> aTile = getTileShape(a);
+ std::optional<SmallVector<int64_t>> bTile = getTileShape(b);
- if (aTileShape.size() != 2 || bTileShape.size() != 2)
+ if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
return std::nullopt;
// semantic check for A and B
- if (aTileShape[1] != bTileShape[0])
+ if ((*aTile)[1] != (*bTile)[0])
return std::nullopt;
// semantic check for C
if (op->getNumOperands() == 3) {
auto c = cast<TypedValue<ShapedType>>(op->getOperand(2));
- SmallVector<int64_t> cTileShape = getTileShape(c);
- int64_t expectedShape[2] = {aTileShape[0], bTileShape[1]};
- if (!llvm::equal(cTileShape, expectedShape))
+ std::optional<SmallVector<int64_t>> cTile = getTileShape(c);
+ int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
+ if (!cTile || !llvm::equal(*cTile, expectedCTile))
return std::nullopt;
}
- return SmallVector<int64_t>({aTileShape[0], aTileShape[1], bTileShape[1]});
+ return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
}
return std::nullopt;
}
@@ -94,11 +106,12 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
for (Value opr : op->getOperands()) {
if (auto value = dyn_cast<TypedValue<ShapedType>>(opr)) {
- auto tileShape = getTileShape(value);
+ std::optional<SmallVector<int64_t>> tileShape = getTileShape(value);
// the tile should have the same rank as the original type
- if (tileShape.size() != static_cast<size_t>(value.getType().getRank()))
+ if (!tileShape ||
+ tileShape->size() != static_cast<size_t>(value.getType().getRank()))
return false;
- if (!llvm::equal(tileShape, value.getType().getShape()))
+ if (!llvm::equal(*tileShape, value.getType().getShape()))
return true;
}
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
index afd8e2d5c4df3..98e84a4420722 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
@@ -6,5 +6,6 @@ add_mlir_dialect_library(MLIRXeGPUUtils
LINK_LIBS PUBLIC
MLIRIR
+ MLIRSCFTransforms
MLIRXeGPUDialect
)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 285a15062e402..e43aac4ce8dc0 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -11,9 +11,12 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
@@ -127,3 +130,182 @@ std::string xegpu::getLayoutName(OpResult res) {
const StringRef prefix = "layout_result_";
return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str();
}
+
+void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
+ MLIRContext *context = op->getContext();
+
+ auto materializeCast = [&](OpBuilder &builder, Type type, ValueRange inputs,
+ Location loc) -> Value {
+ return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+ .getResult(0);
+ };
+
+ { // convert VectorType to RankedTensorType for SCF Structural ops
+ TypeConverter converter;
+ converter.addConversion([&](Type type) -> Type { return type; });
+ converter.addConversion([&](VectorType type) -> Type {
+ return RankedTensorType::get(type.getShape(), type.getElementType());
+ });
+ converter.addSourceMaterialization(materializeCast);
+ converter.addTargetMaterialization(materializeCast);
+
+ mlir::ConversionTarget target(*context);
+ target.addLegalOp<UnrealizedConversionCastOp>();
+
+ mlir::RewritePatternSet patterns(context);
+ scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
+ target);
+ (void)mlir::applyPartialConversion(op, target, std::move(patterns));
+ }
+
+ { // propagate the layout attribute to RankedTensorType by checking
+ // UnrealizedConversionCastOps for VectorType to RankedTensorType casts.
+ op->walk([&](UnrealizedConversionCastOp castOp) {
+ if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
+ return WalkResult::skip();
+
+ Value input = castOp.getInputs()[0];
+ Value result = castOp.getResults()[0];
+ auto inputTy = dyn_cast<VectorType>(input.getType());
+ auto resultTy = dyn_cast<RankedTensorType>(result.getType());
+
+ // Only look at ops casting from VectorType to RankedTensorType
+ if (!inputTy || !resultTy)
+ return WalkResult::skip();
+
+ xegpu::LayoutAttr layout = xegpu::getLayoutAttr(input);
+ if (!layout)
+ return WalkResult::skip();
+
+ RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
+ result.setType(newTy);
+
+ // update the block arguments if the user is a LoopLike op.
+ for (OpOperand &use : result.getUses()) {
+ if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
+ BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
+ arg.setType(newTy);
+ }
+ // scf.while has two regions; the block arguments of its "after" region
+ // are not exposed by LoopLikeOpInterface.
+ if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
+ unsigned idx = use.getOperandNumber();
+ BlockArgument arg = whileOp.getAfterArguments()[idx];
+ arg.setType(newTy);
+ }
+ }
+ return WalkResult::advance();
+ });
+
+ // use the yieldOp as an anchor to update the result types of its parent op
+ op->walk([&](scf::YieldOp yieldOp) {
+ Operation *parentOp = yieldOp->getParentOp();
+ for (OpResult r : parentOp->getOpResults()) {
+ unsigned idx = r.getResultNumber();
+ Type resultTy = r.getType();
+ Type yieldTy = yieldOp.getResults()[idx].getType();
+ if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
+ r.setType(yieldTy);
+ }
+ });
+ }
+
+ { // perform the conversion from RankedTensorType to VectorType based on the
+ // LayoutAttr
+
+ auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
+ DenseI32ArrayAttr sgDataAttr,
+ DenseI32ArrayAttr sgLayoutAttr) {
+ SmallVector<int64_t> tileShape;
+ auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
+ if (sgDataAttr)
+ tileShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
+ else
+ tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape);
+ assert(tileShape.size() && "failed to compute tileShape");
+ SmallVector<int64_t> distUnit =
+ computeElementwiseMul(sgLayout, tileShape);
+ int count = computeProduct(shape) / computeProduct(distUnit);
+ return std::make_pair(tileShape, count);
+ };
+
+ TypeConverter converter;
+ converter.addConversion([&](Type type) -> Type { return type; });
+ converter.addConversion(
+ [&](RankedTensorType type,
+ SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+ ArrayRef<int64_t> shape = type.getShape();
+ auto encoding = type.getEncoding();
+ Type elemTy = type.getElementType();
+
+ // init count and subShape to the default values. If the LayoutAttr
+ // is not present, it will return a VectorType with the original shape.
+ int count = 1;
+ SmallVector<int64_t> subShape(shape);
+
+ if (auto layout =
+ llvm::dyn_cast_if_present<xegpu::LayoutAttr>(encoding)) {
+ if (layout.isWgLayout()) {
+ // for WgToSg, the subShape is either from sgData or computed as
+ // shape/sgLayout
+ std::tie(subShape, count) = computeTileShapeAndCount(
+ shape, layout.getSgData(), layout.getSgLayout());
+ } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
+ // for unrolling, the subShape is determined by inst_data
+ subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+ count = computeProduct(shape) / computeProduct(subShape);
+ }
+ }
+ auto newTy = VectorType::get(subShape, elemTy);
+ result.append(count, newTy);
+ return success();
+ });
+
+ converter.addConversion(
+ [&](xegpu::TensorDescType type,
+ SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+ MLIRContext *ctx = type.getContext();
+ Type elemTy = type.getElementType();
+ Attribute encoding = type.getEncoding();
+ ArrayRef<int64_t> shape = type.getShape();
+
+ // init count and newTy to the default values. If the layout attribute
+ // is not present, it will return the original type.
+ int count = 1;
+ Type newTy = type;
+
+ if (xegpu::LayoutAttr layout = type.getLayoutAttr()) {
+ SmallVector<int64_t> subShape, distUnit;
+ if (layout.isWgLayout()) {
+ // for WgToSg, the subShape is either from sgData or computed as
+ // shape/sgLayout
+ std::tie(subShape, count) = computeTileShapeAndCount(
+ shape, layout.getSgData(), layout.getSgLayout());
+ layout = layout.dropSgLayoutAndData();
+ } else if (DenseI32ArrayAttr instData = layout.getInstData()) {
+ // for unrolling, the subShape is determined by inst_data
+ subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+ count = computeProduct(shape) / computeProduct(subShape);
+ layout = layout.dropInstData();
+ }
+ newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding,
+ layout);
+ }
+
+ result.append(count, newTy);
+ return success();
+ });
+
+ converter.addSourceMaterialization(materializeCast);
+ converter.addTargetMaterialization(materializeCast);
+
+ mlir::ConversionTarget target(*context);
+ target.addLegalOp<UnrealizedConversionCastOp>();
+
+ mlir::RewritePatternSet patterns(context);
+ scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
+ target);
+ (void)mlir::applyPartialConversion(op, target, std::move(patterns));
+ }
+}
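A numeric sketch of computeTileShapeAndCount (hypothetical wg-level layout):

    // shape = [128, 128], sg_layout = [4, 2], sg_data = [16, 32]
    //   tileShape = [16, 32]                        (taken from sg_data)
    //   distUnit  = sg_layout * tileShape = [64, 64]
    //   count     = (128 * 128) / (64 * 64) = 4     (tiles per subgroup)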
From 7b5e8f1193006591062592f5e8858c33113448fe Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 13 May 2025 20:02:45 +0000
Subject: [PATCH 06/11] partial working
---
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 16 +++++++++++-----
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 19 ++++++++++---------
2 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index efc44aadb14e6..737600fe909fa 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -120,18 +120,22 @@ bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
void XeGPUInstructionlizePass::runOnOperation() {
MLIRContext *ctx = &getContext();
+ Operation *op = getOperation();
+
+ // first perform type conversion for SCF control flow ops
+ xegpu::doSCFStructuralTypeConversionWithTensorType(op);
+
xegpu::UnrollOptions options;
options.setFilterConstraint([&](Operation *op) -> LogicalResult {
return needsUnroll(op) ? success() : failure();
});
- options.setNativeShapeFn(
- [&](Operation *op) -> std::optional<SmallVector<int64_t>> {
+ options.setNativeShapeFn([&](Operation *op) {
return getTileShape(op);
});
options.setUnrolledTypesFn(
- [&](ShapedType type, ArrayRef<int64_t> tileShape) -> SmallVector<Type> {
+ [&](ShapedType type, ArrayRef<int64_t> tileShape) {
Type elemTy = type.getElementType();
Type newTy;
@@ -149,8 +153,10 @@ void XeGPUInstructionlizePass::runOnOperation() {
return SmallVector<Type>(computeProduct(*ratio), newTy);
});
- RewritePatternSet patterns(ctx);
+ GreedyRewriteConfig config;
+ config.setStrictness(GreedyRewriteStrictness::ExistingOps);
+ RewritePatternSet patterns(ctx);
populateXeGPUUnrollPatterns(patterns, options);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index e43aac4ce8dc0..cb2c4d40f8a6d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -215,8 +215,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
// LayoutAttr
auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
- DenseI32ArrayAttr sgDataAttr,
- DenseI32ArrayAttr sgLayoutAttr) {
+ DenseI32ArrayAttr sgDataAttr,
+ DenseI32ArrayAttr sgLayoutAttr) {
SmallVector<int64_t> tileShape;
auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
if (sgDataAttr)
@@ -224,8 +224,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
else
tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape);
assert(tileShape.size() && "failed to compute tileShape");
- SmallVector<int64_t> distUnit =
- computeElementwiseMul(sgLayout, tileShape);
+ SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, tileShape);
int count = computeProduct(shape) / computeProduct(distUnit);
return std::make_pair(tileShape, count);
};
@@ -249,8 +248,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
if (layout.isWgLayout()) {
// for WgToSg, the subShape is either from sgData or computed as
// shape/sgLayout
- std::tie(subShape, count) = computeTileShapeAndCount(
- shape, layout.getSgData(), layout.getSgLayout());
+ std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
} else if (DenseI32ArrayAttr instData = layout.getInstData()) {
// for unrolling, the subShape is determined by inst_data
subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
@@ -280,8 +278,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
if (layout.isWgLayout()) {
// for WgToSg, the subShape is either from sgData or computed as
// shape/sgLayout
- std::tie(subShape, count) = computeTileShapeAndCount(
- shape, layout.getSgData(), layout.getSgLayout());
+ std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
layout = layout.dropSgLayoutAndData();
} else if (DenseI32ArrayAttr instData = layout.getInstData()) {
// for unrolling, the subShape is determined by inst_data
@@ -298,7 +295,11 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
});
converter.addSourceMaterialization(materializeCast);
- converter.addTargetMaterialization(materializeCast);
+ converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
+ ValueRange inputs, Location loc) {
+ return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+ .getResults();
+ });
mlir::ConversionTarget target(*context);
target.addLegalOp<UnrealizedConversionCastOp>();
From e2eb9e63df30e9e84d3d09060ec493bc2b805f3d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 15 May 2025 21:22:16 +0000
Subject: [PATCH 07/11] refactor pack and unpack
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 39 ++++-
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 163 +++++++++++++-----
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 25 +--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 152 +++++++++++++++-
4 files changed, 301 insertions(+), 78 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 4bcda3e3ac95f..b41da0ea6a276 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -15,6 +15,8 @@ namespace mlir {
class VectorType;
class OpOperand;
class OpResult;
+class OpBuilder;
+class ValueRange;
namespace xegpu {
class LayoutAttr;
@@ -53,17 +55,46 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
LayoutAttr layout);
+/// Return the attribute name used to attach a LayoutAttr to the given OpOperand.
+std::string getLayoutName(OpOperand &opr);
+
+/// Return the attribute name used to attach a LayoutAttr to the given OpResult.
+std::string getLayoutName(OpResult res);
+
/// Retrieves the LayoutAttr associated with a given Value. For TensorDescType
/// values, the LayoutAttr is extracted from the TensorDescType itself. For
/// other values, it is obtained from the attributes of the defining operation.
/// Returns nullptr if no LayoutAttr is found.
LayoutAttr getLayoutAttr(Value value);
-/// Retrieves the name for the LayoutAttr associated with a given OpOperand.
-std::string getLayoutName(OpOperand &opr);
+/// Retrieves the LayoutAttr associated with a given OpOperand. It will
+/// first check the layout_operand_{id} attribute of the owner op. If not found,
+/// it will check the operand itself and its defining op.
+LayoutAttr getLayoutAttr(OpOperand &opr);
-/// Retrieves the name for the LayoutAttr associated with a given OpResult.
-std::string getLayoutName(OpResult res);
+/// Sets the LayoutAttr for a given OpOperand by attaching it to the owner op.
+void setLayoutAttr(OpOperand &opr, LayoutAttr layout);
+
+/// Sets the LayoutAttr for the given OpResult by attaching it to the defining op.
+void setLayoutAttr(OpResult result, LayoutAttr layout);
+
+/// Sets the LayoutAttr for each OpOperand and OpResult of the given operation.
+/// If the operation contains regions, it is also applied recursively to the
+/// contained operations.
+void setLayoutAttrs(Operation *op,
+ function_ref<LayoutAttr(Value)> getLayoutImpl);
+
+/// Extract a set of small vectors with the given shape from a value using
+/// vector.extract_strided_slice.
+SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
+ Location loc, Value value,
+ ArrayRef<int64_t> shape);
+
+/// Create a vector of the given shape from a set of values using
+/// vector.insert_strided_slice.
+Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
+ ValueRange values,
+ ArrayRef<int64_t> shape);
/// Do type conversion for SCF structural ops, e.g., scf.for. Since VectorType
/// cannot carry the layout attribute, vector values are first converted to
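A sketch of the new pack/unpack pair on hypothetical shapes:

    // extractVectorsWithShapeFromValue(b, loc, v /*vector<32x32xf16>*/,
    //                                  {8, 16})
    //   -> 8 values of vector<8x16xf16> via vector.extract_strided_slice
    // createVectorWithShapeFromValues(b, loc, those 8 values, {32, 32})
    //   -> one vector<32x32xf16> via vector.insert_strided_slice over a
    //      zero-initialized constant of the full shape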
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index 737600fe909fa..0e01c7e4d9763 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -45,6 +46,10 @@ class XeGPUInstructionlizePass final
std::optional<SmallVector<int64_t>>
getTileShape(TypedValue<ShapedType> value) const;
+ std::optional<SmallVector<int64_t>> getTileShape(OpOperand &operand) const;
+
+ std::optional<SmallVector<int64_t>> getTileShape(OpResult result) const;
+
// Get the tile shape for a given operation.
std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;
@@ -67,20 +72,46 @@ XeGPUInstructionlizePass::getTileShape(TypedValue<ShapedType> value) const {
return std::nullopt;
}
+std::optional<SmallVector<int64_t>>
+XeGPUInstructionlizePass::getTileShape(OpOperand &operand) const {
+ xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand);
+ if (layout && layout.isSgLayout()) {
+ if (auto inst_data = layout.getInstData())
+ return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());
+
+ if (auto type = dyn_cast<ShapedType>(operand.get().getType()))
+ return llvm::to_vector(type.getShape());
+ }
+ return std::nullopt;
+}
+
+std::optional<SmallVector<int64_t>>
+XeGPUInstructionlizePass::getTileShape(OpResult result) const {
+ xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
+ if (layout && layout.isSgLayout()) {
+ if (auto inst_data = layout.getInstData())
+ return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());
+
+ if (auto type = dyn_cast<ShapedType>(result.getType()))
+ return llvm::to_vector(type.getShape());
+ }
+ return std::nullopt;
+}
+
std::optional<SmallVector<int64_t>>
XeGPUInstructionlizePass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
- return getTileShape(cast<TypedValue<ShapedType>>(op->getResult(0)));
+ return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp>(op))
- return getTileShape(cast<TypedValue<ShapedType>>(op->getOperand(0)));
+ return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp>(op))
- return getTileShape(cast<TypedValue<ShapedType>>(op->getOperand(1)));
+ return getTileShape(op->getOpOperand(1));
if (isa<xegpu::DpasOp>(op)) {
- auto a = cast<TypedValue<ShapedType>>(op->getOperand(0));
- auto b = cast<TypedValue<ShapedType>>(op->getOperand(1));
- std::optional<SmallVector<int64_t>> aTile = getTileShape(a);
- std::optional<SmallVector<int64_t>> bTile = getTileShape(b);
+ std::optional<SmallVector<int64_t>> aTile =
+ getTileShape(op->getOpOperand(0));
+ std::optional<SmallVector<int64_t>> bTile =
+ getTileShape(op->getOpOperand(1));
if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
return std::nullopt;
@@ -91,8 +122,8 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
// semantic check for C
if (op->getNumOperands() == 3) {
- auto c = cast<TypedValue<ShapedType>>(op->getOperand(2));
- std::optional<SmallVector<int64_t>> cTile = getTileShape(c);
+ std::optional<SmallVector<int64_t>> cTile =
+ getTileShape(op->getOpOperand(2));
int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
if (!cTile || !llvm::equal(*cTile, expectedCTile))
return std::nullopt;
@@ -104,59 +135,101 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
}
bool XeGPUInstructionlizePass::needsUnroll(Operation *op) const {
- for (Value opr : op->getOperands()) {
- if (auto value = dyn_cast<TypedValue<ShapedType>>(opr)) {
- std::optional<SmallVector<int64_t>> tileShape = getTileShape(value);
- // the tile should have the same rank as the original type
- if (!tileShape ||
- tileShape->size() != static_cast<size_t>(value.getType().getRank()))
- return false;
- if (!llvm::equal(*tileShape, value.getType().getShape()))
- return true;
- }
+ if (isa<LoopLikeOpInterface>(op))
+ return false;
+
+ for (auto &opr : op->getOpOperands()) {
+ std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
+ auto shapedType = dyn_cast<ShapedType>(opr.get().getType());
+ if (!shapedType)
+ continue;
+
+ if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
+ return true;
+ }
+
+ for (auto result : op->getOpResults()) {
+ std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
+ auto shapedType = dyn_cast<ShapedType>(result.getType());
+ if (!shapedType)
+ continue;
+
+ if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
+ return true;
}
return false;
}
void XeGPUInstructionlizePass::runOnOperation() {
MLIRContext *ctx = &getContext();
- Operation *op = getOperation();
+ Operation *mod = getOperation();
+
+ // Preserve the LayoutAttr for each operand in the owner op's attribute dictionary.
+ // This ensures that the LayoutAttr remains accessible even if the defining
+ // operation is replaced.
+ xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); });
- // first perform type conversion for SCF control flow ops
- xegpu::doSCFStructuralTypeConversionWithTensorType(op);
+ // Perform type conversion for SCF control flow ops
+ xegpu::doSCFStructuralTypeConversionWithTensorType(mod);
xegpu::UnrollOptions options;
options.setFilterConstraint([&](Operation *op) -> LogicalResult {
return needsUnroll(op) ? success() : failure();
});
- options.setNativeShapeFn([&](Operation *op) {
- return getTileShape(op);
- });
+ options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
- options.setUnrolledTypesFn(
- [&](ShapedType type, ArrayRef<int64_t> tileShape) {
- Type elemTy = type.getElementType();
- Type newTy;
+ options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape) {
+ Type elemTy = type.getElementType();
+ Type newTy;
- if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type))
- newTy = xegpu::TensorDescType::get(
- ctx, tileShape, elemTy, tdescTy.getEncoding(),
- tdescTy.getLayoutAttr().dropInstData());
- else
- newTy = type.clone(tileShape, elemTy);
+ if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type))
+ newTy = xegpu::TensorDescType::get(
+ ctx, tileShape, elemTy, tdescTy.getEncoding(),
+ tdescTy.getLayoutAttr().dropInstData());
+ else
+ newTy = type.clone(tileShape, elemTy);
- std::optional<SmallVector<int64_t>> ratio =
- computeShapeRatio(type.getShape(), tileShape);
- assert(ratio &&
- "The shape of the type must be a multiple of tileShape.");
- return SmallVector<Type>(computeProduct(*ratio), newTy);
- });
-
- GreedyRewriteConfig config;
- config.setStrictness(GreedyRewriteStrictness::ExistingOps);
+ std::optional<SmallVector<int64_t>> ratio =
+ computeShapeRatio(type.getShape(), tileShape);
+ assert(ratio && "The shape of the type must be a multiple of tileShape.");
+ return SmallVector<Type>(computeProduct(*ratio), newTy);
+ });
RewritePatternSet patterns(ctx);
populateXeGPUUnrollPatterns(patterns, options);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+ (void)applyPatternsGreedily(mod, std::move(patterns));
+
+ mod->walk([&](UnrealizedConversionCastOp castOp) {
+ ValueRange inputs = castOp.getInputs();
+ ValueRange outputs = castOp.getOutputs();
+
+ if (inputs.size() == 1 && outputs.size() == 1) {
+ castOp->replaceAllUsesWith(inputs);
+ castOp->erase();
+ }
+
+ VectorType inputTy = dyn_cast<VectorType>(inputs[0].getType());
+ VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
+ if (inputTy && outputTy) {
+ OpBuilder builder(castOp);
+ // unpack
+ if (inputs.size() > 1 && outputs.size() == 1) {
+ ArrayRef<int64_t> shape = outputTy.getShape();
+ Value result = xegpu::createVectorWithShapeFromValues(
+ builder, castOp.getLoc(), inputs, shape);
+ castOp->replaceAllUsesWith(ValueRange(result));
+ castOp->erase();
+ }
+
+ // pack
+ if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
+ ArrayRef<int64_t> tileShape = outputTy.getShape();
+ SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
+ builder, castOp.getLoc(), inputs[0], tileShape);
+ castOp->replaceAllUsesWith(results);
+ castOp->erase();
+ }
+ }
+ });
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 44d45dd2eaec0..d9f69158f95eb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -17,6 +17,7 @@
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
@@ -74,17 +75,7 @@ struct UnrollPattern : public OpRewritePattern<SourceOp> {
assert(vecTy.getRank() == static_cast<int64_t>(blockSize.size()) &&
"Expecting blockSize size to match the rank of destTy.");
auto shape = vecTy.getShape();
- auto zeroAttr = rewriter.getZeroAttr(vecTy.getElementType());
-
- Value result = rewriter.create<arith::ConstantOp>(
- loc, vecTy, DenseElementsAttr::get(vecTy, zeroAttr));
- for (auto [src, offsets] :
- llvm::zip_equal(srcs, StaticTileOffsetRange(shape, blockSize))) {
- SmallVector<int64_t> staticStrides(offsets.size(), 1);
- result = rewriter.create<vector::InsertStridedSliceOp>(
- loc, src, result, offsets, staticStrides);
- }
- return result;
+ return xegpu::createVectorWithShapeFromValues(rewriter, loc, srcs, shape);
}
if (isa<xegpu::TensorDescType>(destTy)) {
@@ -109,16 +100,8 @@ struct UnrollPattern : public OpRewritePattern<SourceOp> {
if (auto vecTy = dyn_cast<VectorType>(src.getType())) {
assert(vecTy.getRank() == static_cast<int64_t>(blockSize.size()) &&
"Expecting blockSize size to match the rank of src.");
- auto shape = vecTy.getShape();
- SmallVector<Value> results;
- for (SmallVector<int64_t> offsets :
- StaticTileOffsetRange(shape, blockSize)) {
- SmallVector<int64_t> staticStrides(offsets.size(), 1);
- auto slice = rewriter.create<vector::ExtractStridedSliceOp>(
- loc, src, offsets, blockSize, staticStrides);
- results.push_back(slice);
- }
- return results;
+ return xegpu::extractVectorsWithShapeFromValue(rewriter, loc, src,
+ blockSize);
}
if (isa<xegpu::TensorDescType>(src.getType())) {
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index cb2c4d40f8a6d..60c8493f552d8 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -14,15 +14,26 @@
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
+#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
using namespace mlir;
+/// Convert an ArrayRef<ValueRange> into a flattened SmallVector<Value>.
+static SmallVector<Value> flattenValues(ArrayRef<ValueRange> values) {
+ SmallVector<Value> result;
+ for (const auto &vals : values)
+ llvm::append_range(result, vals);
+ return result;
+}
+
FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
@@ -90,6 +101,16 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
return xegpu::getDistributedVectorType(helperTdescTy);
}
+std::string xegpu::getLayoutName(OpOperand &opr) {
+ const StringRef prefix("layout_operand_");
+ return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str();
+}
+
+std::string xegpu::getLayoutName(OpResult res) {
+ const StringRef prefix = "layout_result_";
+ return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str();
+}
+
xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
if (!value)
return nullptr;
@@ -121,14 +142,86 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
return nullptr;
}
-std::string xegpu::getLayoutName(OpOperand &opr) {
- const StringRef prefix("layout_operand_");
- return llvm::formatv("{0}{1}", prefix, opr.getOperandNumber()).str();
+xegpu::LayoutAttr xegpu::getLayoutAttr(OpOperand &opr) {
+ Operation *op = opr.getOwner();
+ std::string layoutName = xegpu::getLayoutName(opr);
+ if (op->hasAttr(layoutName))
+ return op->getAttrOfType<xegpu::LayoutAttr>(layoutName);
+ return getLayoutAttr(opr.get());
}
-std::string xegpu::getLayoutName(OpResult res) {
- const StringRef prefix = "layout_result_";
- return llvm::formatv("{0}{1}", prefix, res.getResultNumber()).str();
+void xegpu::setLayoutAttr(OpOperand &opr, LayoutAttr layout) {
+ auto owner = opr.getOwner();
+ std::string name = xegpu::getLayoutName(opr);
+ if (layout && !owner->hasAttrOfType<LayoutAttr>(name))
+ owner->setAttr(name, layout);
+}
+
+void xegpu::setLayoutAttr(OpResult result, LayoutAttr layout) {
+ Operation *owner = result.getOwner();
+ std::string name = xegpu::getLayoutName(result);
+ if (layout && !owner->hasAttr(name))
+ owner->setAttr(name, layout);
+}
+
+void xegpu::setLayoutAttrs(Operation *mod,
+ function_ref<LayoutAttr(Value)> getLayoutImpl) {
+ mod->walk([&](Operation *op) {
+ for (OpResult result : op->getOpResults()) {
+ auto layout = getLayoutImpl(result);
+ setLayoutAttr(result, layout);
+ }
+ for (OpOperand &opr : op->getOpOperands()) {
+ auto layout = getLayoutImpl(opr.get());
+ setLayoutAttr(opr, layout);
+ }
+ });
+}
+
+SmallVector<Value>
+xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
+ Value value, ArrayRef<int64_t> shape) {
+ auto vecTy = dyn_cast<VectorType>(value.getType());
+ if (!vecTy)
+ return {value};
+
+ ArrayRef<int64_t> srcShape = vecTy.getShape();
+ if (!computeShapeRatio(srcShape, shape))
+ return {value};
+
+ SmallVector<Value> result;
+ for (SmallVector<int64_t> offsets : StaticTileOffsetRange(srcShape, shape)) {
+ SmallVector<int64_t> staticStrides(offsets.size(), 1);
+ result.push_back(builder.create<vector::ExtractStridedSliceOp>(
+ loc, value, offsets, shape, staticStrides));
+ }
+
+ return result;
+}
+
+Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
+ ValueRange values,
+ ArrayRef<int64_t> shape) {
+ VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
+ assert(llvm::all_of(values.getTypes(),
+ [&](Type type) { return type == inputTy; }) &&
+ "values must be of the same VectorType");
+
+ Type elemTy = inputTy.getElementType();
+ ArrayRef<int64_t> tileShape = inputTy.getShape();
+
+ VectorType resultTy = VectorType::get(shape, elemTy);
+ auto zeroAttr = builder.getZeroAttr(elemTy);
+ Value result = builder.create<arith::ConstantOp>(
+ loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));
+
+ for (auto [src, offsets] :
+ llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
+ SmallVector<int64_t> staticStrides(offsets.size(), 1);
+ result = builder.create<vector::InsertStridedSliceOp>(
+ loc, src, result, offsets, staticStrides);
+ }
+ return result;
}
void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
@@ -213,7 +306,6 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
{ // perform the conversion from RankedTensorType to VectorType based on the
// LayoutAttr
-
auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
DenseI32ArrayAttr sgDataAttr,
DenseI32ArrayAttr sgLayoutAttr) {
@@ -302,9 +394,53 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
});
mlir::ConversionTarget target(*context);
- target.addLegalOp<UnrealizedConversionCastOp>();
+ target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+ [&](UnrealizedConversionCastOp op) {
+ auto isTensorTy = [&](Type type) {
+ return isa<RankedTensorType>(type);
+ };
+ if (llvm::any_of(op->getOperandTypes(), isTensorTy) ||
+ llvm::any_of(op->getResultTypes(), isTensorTy))
+ return false;
+ return true;
+ });
+
+ class UnrealizedConversionCastOpPattern
+ : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
+ using OpConversionPattern<
+ mlir::UnrealizedConversionCastOp>::OpConversionPattern;
+
+ mlir::LogicalResult
+ matchAndRewrite(mlir::UnrealizedConversionCastOp op,
+ OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto inputs = op.getOperands();
+ auto outputs = op.getOutputs();
+
+ if (inputs.size() != 1 || outputs.size() != 1)
+ return failure();
+
+ auto inputTy = inputs[0].getType();
+ auto outputTy = outputs[0].getType();
+
+ if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
+ rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
+ return success();
+ }
+
+ if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
+ SmallVector<Value> values = flattenValues(adaptor.getInputs());
+ auto newOp = rewriter.create<UnrealizedConversionCastOp>(
+ op.getLoc(), outputTy, values);
+ rewriter.replaceOp(op, newOp);
+ return success();
+ }
+ return failure();
+ }
+ };
mlir::RewritePatternSet patterns(context);
+ patterns.insert<UnrealizedConversionCastOpPattern>(context);
scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
target);
(void)mlir::applyPartialConversion(op, target, std::move(patterns));
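How the casts flow end to end, sketched on hypothetical types:

    // after unrolling, a 1:1 cast tensor<32x32xf16> -> vector<32x32xf16>
    // whose tensor operand was itself split 1:N is rewritten by
    // UnrealizedConversionCastOpPattern into an N:1 cast from the eight
    // vector<8x16xf16> pieces; the post-pass walk then materializes that
    // cast with vector.insert_strided_slice ops (the reverse direction uses
    // vector.extract_strided_slice).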
From 6ec3604310f3abf10d576162b14e0820839056e5 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 15 May 2025 23:42:54 +0000
Subject: [PATCH 08/11] cleanup layout attr
---
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 72 ++++++++++++-------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 6 +-
2 files changed, 50 insertions(+), 28 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index 0e01c7e4d9763..fba0f882ef632 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -32,6 +32,39 @@ using namespace mlir;
namespace {
+void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
+ ValueRange inputs = castOp.getInputs();
+ ValueRange outputs = castOp.getOutputs();
+
+ if (inputs.size() == 1 && outputs.size() == 1) {
+ castOp->replaceAllUsesWith(inputs);
+ castOp->erase();
+ // return early: castOp (and thus inputs/outputs) must not be used
+ // after the erase
+ return;
+ }
+
+ VectorType inputTy = dyn_cast<VectorType>(inputs[0].getType());
+ VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
+ if (inputTy && outputTy) {
+ OpBuilder builder(castOp);
+    // unpack: a cast from N unrolled tiles back to the original vector type.
+ if (inputs.size() > 1 && outputs.size() == 1) {
+ ArrayRef<int64_t> shape = outputTy.getShape();
+ Value result = xegpu::createVectorWithShapeFromValues(
+ builder, castOp.getLoc(), inputs, shape);
+ castOp->replaceAllUsesWith(ValueRange(result));
+ castOp->erase();
+ }
+
+    // pack: a cast from the original vector to N unrolled tiles.
+ if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
+ ArrayRef<int64_t> tileShape = outputTy.getShape();
+ SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
+ builder, castOp.getLoc(), inputs[0], tileShape);
+ castOp->replaceAllUsesWith(results);
+ castOp->erase();
+ }
+ }
+}
+
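A hedged illustration of the two cast forms this helper resolves; the shapes are assumed for the example, matching the 8x16 inst_data used elsewhere in this PR:

    // unpack: N unrolled tiles -> one vector of the original shape.
    //   %v = unrealized_conversion_cast %t0, %t1, %t2, %t3
    //        : vector<8x16xf16>, ..., vector<8x16xf16> to vector<16x32xf16>
    //   resolved via createVectorWithShapeFromValues (insert_strided_slice).
    //
    // pack: one vector -> N unrolled tiles.
    //   %t:4 = unrealized_conversion_cast %v
    //        : vector<16x32xf16> to vector<8x16xf16>, ..., vector<8x16xf16>
    //   resolved via extractVectorsWithShapeFromValue (presumably
    //   vector.extract_strided_slice; its body is not shown in this patch).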
/// Unroll XeGPU ops to their instruction-level representation.
class XeGPUInstructionlizePass final
: public xegpu::impl::XeGPUInstructionlizeBase<XeGPUInstructionlizePass> {
@@ -200,35 +233,22 @@ void XeGPUInstructionlizePass::runOnOperation() {
populateXeGPUUnrollPatterns(patterns, options);
(void)applyPatternsGreedily(mod, std::move(patterns));
- mod->walk([&](UnrealizedConversionCastOp castOp) {
- ValueRange inputs = castOp.getInputs();
- ValueRange outputs = castOp.getOutputs();
+ mod->walk([&](Operation *op) {
+ if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
+ resolveUnrealizedConversionCastOp(castOp);
- if (inputs.size() == 1 && outputs.size() == 1) {
- castOp->replaceAllUsesWith(inputs);
- castOp->erase();
+ for (OpOperand &opr : op->getOpOperands()) {
+ std::string name = xegpu::getLayoutName(opr);
+ if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name))
+ op->removeAttr(name);
}
- VectorType inputTy = dyn_cast<VectorType>(inputs[0].getType());
- VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
- if (inputTy && outputTy) {
- OpBuilder builder(castOp);
- // unpack
- if (inputs.size() > 1 && outputs.size() == 1) {
- ArrayRef<int64_t> shape = outputTy.getShape();
- Value result = xegpu::createVectorWithShapeFromValues(
- builder, castOp.getLoc(), inputs, shape);
- castOp->replaceAllUsesWith(ValueRange(result));
- castOp->erase();
- }
-
- // pack
- if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
- ArrayRef<int64_t> tileShape = outputTy.getShape();
- SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
- builder, castOp.getLoc(), inputs[0], tileShape);
- castOp->replaceAllUsesWith(results);
- castOp->erase();
+ for (OpResult result : op->getOpResults()) {
+ std::string name = xegpu::getLayoutName(result);
+ if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
+ op->removeAttr(name);
+ if (!isa<LoopLikeOpInterface>(op))
+ xegpu::setLayoutAttr(result, layout.dropInstData());
}
}
});
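For orientation, a minimal sketch of the attribute names the cleanup walk queries, assuming the getLayoutName helpers keep the layout_operand_<n> / layout_result_<n> naming scheme used by this PR:

    // Illustrative only.
    std::string oprName = xegpu::getLayoutName(op->getOpOperand(0));
    // -> "layout_operand_0"
    std::string resName = xegpu::getLayoutName(op->getOpResult(0));
    // -> "layout_result_0"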
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 60c8493f552d8..023e445206440 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -115,7 +115,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(Value value) {
if (!value)
return nullptr;
- if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(value.getType()))
+ if (auto tdescTy =
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
return tdescTy.getLayoutAttr();
if (auto result = dyn_cast<OpResult>(value)) {
@@ -366,7 +367,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
Type newTy = type;
if (xegpu::LayoutAttr layout = type.getLayoutAttr()) {
- SmallVector<int64_t> subShape, distUnit;
+ SmallVector<int64_t> subShape(shape);
if (layout.isWgLayout()) {
// for WgToSg, the subShape is either from sgData or computed as
// shape/sgLayout
@@ -378,6 +379,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
count = computeProduct(shape) / computeProduct(subShape);
layout = layout.dropInstData();
}
+
newTy = xegpu::TensorDescType::get(ctx, subShape, elemTy, encoding,
layout);
}
>From bc69a8de7e0d436a7718fc2b30ee4bbd7861e5a4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 16 May 2025 14:10:26 +0000
Subject: [PATCH 09/11] check in elemwise support
---
.../Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index fba0f882ef632..078b674de8d4f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -164,6 +164,10 @@ XeGPUInstructionlizePass::getTileShape(Operation *op) const {
return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
}
+
+ if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
+ return getTileShape(op->getOpResult(0));
+
return std::nullopt;
}
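A hedged note on the new elementwise branch, assuming getTileShape(OpResult) reads the inst_data of the result's LayoutAttr:

    // Illustrative only: for
    //   %e = math.exp %a {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>}
    //       : vector<16x32xf16>
    // getTileShape is expected to return {8, 16}, so the op unrolls into
    // four 8x16 math.exp instances (see CHECK-COUNT-4 in the test below).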
@@ -230,7 +234,14 @@ void XeGPUInstructionlizePass::runOnOperation() {
});
RewritePatternSet patterns(ctx);
+
+ vector::UnrollVectorOptions vectorOptions;
+ // vectorOptions.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
+ vectorOptions.setNativeShapeFn(options.nativeShape);
+
populateXeGPUUnrollPatterns(patterns, options);
+ vector::populateVectorUnrollPatterns(patterns, vectorOptions);
+
(void)applyPatternsGreedily(mod, std::move(patterns));
mod->walk([&](Operation *op) {
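For reference, a hedged sketch of the callback shape vector::UnrollVectorOptions expects; reusing the XeGPU nativeShape callback, as the patch does, makes elementwise vector ops unroll to the same inst_data tile sizes as the XeGPU ops:

    // Illustrative only: the callback signature accepted by
    // vector::UnrollVectorOptions::setNativeShapeFn.
    vector::UnrollVectorOptions vectorOptions;
    vectorOptions.setNativeShapeFn(
        [&](Operation *op) -> std::optional<SmallVector<int64_t>> {
          return getTileShape(op); // assumed to return the inst_data tile
        });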
>From 4fc75402332a5062eaa20b51f20ef54b4e5281ac Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 16 May 2025 14:43:59 +0000
Subject: [PATCH 10/11] check in unit test
---
.../Dialect/XeGPU/xegpu-instructionlize.mlir | 123 ++++++++++++++++++
1 file changed, 123 insertions(+)
create mode 100644 mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
diff --git a/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir b/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
new file mode 100644
index 0000000000000..888684789cc8c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-instructionlize.mlir
@@ -0,0 +1,123 @@
+// RUN: mlir-opt --xegpu-instructionlize -split-input-file %s | FileCheck %s
+
+
+#a = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+#b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
+#c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
+
+#l1 = #xegpu.layout<inst_data = [8, 16]>
+#l2 = #xegpu.layout<inst_data = [16, 16]>
+
+gpu.module @test_kernel {
+ gpu.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %m = arith.muli %block_id_x, %c16 : index
+ %n = arith.muli %block_id_y, %c32 : index
+
+ %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+ %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+
+ %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ %out:3 = scf.for %k = %c0 to %c1024 step %c32
+ iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+ -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
+ scf.yield %a_next_tdesc, %b_next_tdesc, %c
+ : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+ }
+ //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ gpu.return
+ }
+
+ //-----
+ gpu.func @test_gemm_simple(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %m = arith.muli %block_id_x, %c16 : index
+ %n = arith.muli %block_id_y, %c32 : index
+
+ %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1>
+ %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32>
+
+ %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2>
+ %out:3 = scf.for %k = %c0 to %c1024 step %c32
+ iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+ -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) {
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %c = xegpu.dpas %a, %b, %arg2 {layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16>
+ %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16>
+ %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2>
+ scf.yield %a_next_tdesc, %b_next_tdesc, %c
+ : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>
+ }
+ //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1>
+ gpu.return
+ }
+
+ //-----
+
+ gpu.func @test_gemm_a_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %block_id_x = gpu.block_id x
+ %block_id_y = gpu.block_id y
+ %m = arith.muli %block_id_x, %c16 : index
+ %n = arith.muli %block_id_y, %c32 : index
+
+ %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c>
+ %c_init = xegpu.load_nd %c_tdesc : !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32>
+
+ %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a>
+ %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b>
+ %out:3 = scf.for %k = %c0 to %c1024 step %c32
+ iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init)
+ -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) {
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16>
+ %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16>
+ //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16>
+ %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16>
+ //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16>
+ %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16>
+ //CHECK-COUNT-8: xegpu.dpas {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %c = xegpu.dpas %e, %b, %arg2 {layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a>
+ //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [16, 1]>>
+ %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b>
+ scf.yield %a_next_tdesc, %b_next_tdesc, %c
+ : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>
+ }
+ //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [8, 1]>>
+ xegpu.store_nd %out#2, %c_tdesc: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c>
+ gpu.return
+  }
+}
>From 132f15e7400b92b61801ca0bf013be66a95c54d1 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 16 May 2025 15:06:25 +0000
Subject: [PATCH 11/11] fix format
---
.../XeGPU/Transforms/XeGPUInstructionlize.cpp | 1 -
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 15 +++++++++------
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
index 078b674de8d4f..f0ebe2321f8f1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUInstructionlize.cpp
@@ -236,7 +236,6 @@ void XeGPUInstructionlizePass::runOnOperation() {
RewritePatternSet patterns(ctx);
vector::UnrollVectorOptions vectorOptions;
- // vectorOptions.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
vectorOptions.setNativeShapeFn(options.nativeShape);
populateXeGPUUnrollPatterns(patterns, options);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 023e445206440..14b2b909e143a 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -308,8 +308,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
{ // perform the conversion from RankedTensorType to VectorType based on the
// LayoutAttr
auto computeTileShapeAndCount = [&](ArrayRef<int64_t> shape,
- DenseI32ArrayAttr sgDataAttr,
- DenseI32ArrayAttr sgLayoutAttr) {
+ DenseI32ArrayAttr sgDataAttr,
+ DenseI32ArrayAttr sgLayoutAttr) {
SmallVector<int64_t> tileShape;
auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
if (sgDataAttr)
@@ -317,7 +317,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
else
tileShape = computeShapeRatio(shape, sgLayout).value_or(tileShape);
assert(tileShape.size() && "failed to compute tileShape");
- SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, tileShape);
+ SmallVector<int64_t> distUnit =
+ computeElementwiseMul(sgLayout, tileShape);
int count = computeProduct(shape) / computeProduct(distUnit);
return std::make_pair(tileShape, count);
};
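A worked example of the arithmetic above (numbers illustrative):

    // shape = {128, 64}, sgLayout = {4, 4}, sgData absent:
    //   tileShape = shape / sgLayout              = {32, 16}
    //   distUnit  = sgLayout * tileShape          = {128, 64}
    //   count     = product(shape) / product(distUnit) = 8192 / 8192 = 1
    // With sgData = {16, 16} instead:
    //   tileShape = {16, 16}, distUnit = {64, 64}, count = 8192 / 4096 = 2.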
@@ -341,7 +342,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
if (layout.isWgLayout()) {
// for WgToSg, the subShape is either from sgData or computed as
// shape/sgLayout
- std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
+ std::tie(subShape, count) = computeTileShapeAndCount(
+ shape, layout.getSgData(), layout.getSgLayout());
} else if (DenseI32ArrayAttr instData = layout.getInstData()) {
// for unrolling, the subShape is determined by inst_data
subShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
@@ -371,7 +373,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
if (layout.isWgLayout()) {
// for WgToSg, the subShape is either from sgData or computed as
// shape/sgLayout
- std::tie(subShape, count) = computeTileShapeAndCount(shape, layout.getSgData(), layout.getSgLayout());
+ std::tie(subShape, count) = computeTileShapeAndCount(
+ shape, layout.getSgData(), layout.getSgLayout());
layout = layout.dropSgLayoutAndData();
} else if (DenseI32ArrayAttr instData = layout.getInstData()) {
// for unrolling, the subShape is determined by inst_data
@@ -390,7 +393,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(Operation *op) {
converter.addSourceMaterialization(materializeCast);
converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
- ValueRange inputs, Location loc) {
+ ValueRange inputs, Location loc) {
return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
.getResults();
});