[Mlir-commits] [mlir] [MLIR][XeGPU] TensorDesc Type support generic DistributeLayout instead of Layout (PR #190401)

Fri Apr 3 13:43:24 PDT 2026

https://github.com/Jianhui-Li created https://github.com/llvm/llvm-project/pull/190401

This PR allows the XeGPU TensorDesc type to carry a generic DistributeLayout (for example, a slice layout) instead of only a plain Layout.

>From 3a6c2fe41fa7953ca42e94e5663231b33052ce00 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 2 Apr 2026 22:42:50 +0000
Subject: [PATCH 1/7] initial implementation

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |   5 +-
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 473 ++++++++++++++++--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  77 ++-
 3 files changed, 499 insertions(+), 56 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 9cf9a8705209b..5f46eab7b74c7 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -183,10 +183,13 @@ setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
                 VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
                 const uArch::uArch *uArch);
 
+DistributeLayoutAttr
+inferSourceLayoutFromResult(OpOperand &operand, DistributeLayoutAttr resLayout);
+
 /// Gets the expected layout for a given consumer operand. This will check if
 /// the owning operation of the consumer operand is one of the special layout
 /// users and determine the expected layout accordingly.
-xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
+DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
 
 } // namespace xegpu
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 55cd6ec04970c..06cd0eaa0059e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -18,16 +18,22 @@
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/ValueRange.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>
 
+#define DEBUG_TYPE "xegpu-layout-recovery"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
 using namespace mlir;
 
 void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
@@ -80,32 +86,330 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
   return out;
 }
 
-// Attach layout attributes to all vector-type operands of operations within
-// the given operation's region. Reports an error if any vector operand lacks
-// a layout attribute.
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
-  auto result = rootOp->walk([&](Operation *op) {
-    for (OpOperand &operand : op->getOpOperands()) {
-      // Layouts are needed for vector type only.
-      if (!isa<VectorType>(operand.get().getType()))
-        continue;
-      // Skip block arguments since they don't have defining ops to attach
-      // layout attributes to.
-      if (isa<BlockArgument>(operand.get()))
+// Prerequisite for Layout Recovery
+// It relies on the following invariant:
+// 1. there is no layout conflict between different uses of the same definition.
+// 2. each definition has a well-defined layout requirement at its use point.
+//     - Every definition must have at least one use that appears after it in
+//     topological order.
+//     - If a definition has no such use (e.g., a loop result or region output),
+//     an explicit convert_layout operation is inserted to create a use.
+//     - Only the result of convert_layout is permitted to have no subsequent
+//     use.
+
+// The recovery proceeds by scanning the operation in reverse topological order
+// as follows:
+//    For regular operations: First the result layouts are propagated from uses.
+//      Then the result layouts are propagated to operands.
+//
+//    For region operations (e.g., loops):
+//       - When backward propagation reaches a region op, it sets the layout of
+//       the region op’s results according to use points like regular ops.
+//       - Then, the result layouts (such as a loop output) are propagated to
+//       their corresponding operands in the yield.
+//       - When backward propagation reaches the first operation inside the
+//       region, the pass examines the region op’s initialization list,
+//       propagating from region arguments to the corresponding initialization
+//       operands.
+//       - This ensures that layouts are consistently propagated
+//       across region boundaries while preserving a single well-defined use for
+//       each definition at the region-op level.
+
+// the inner function for recoverTemporaryLayouts is a recursive function
+// the input rootOp is the function operation, which is also a region op.
+// it recursivley process the region op in reverse topological order.
+
+static void walkRegionBackward(Region &region,
+                               llvm::function_ref<void(Operation *)> visit) {
+  // blocks: back -> front
+  for (Block &block : llvm::reverse(region)) {
+    // ops: back -> front, early-inc so visit() may erase current op safely
+    for (Operation &op : llvm::reverse(block)) {
+      // make sure we first visit inside the region op (so yield op first)
+      // and then move to region op itself
+      for (Region &nested : llvm::reverse(op.getRegions()))
+        walkRegionBackward(nested, visit);
+
+      visit(&op);
+    }
+  }
+}
+
+static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
+  xegpu::DistributeLayoutAttr layout = nullptr;
+  for (OpOperand &use : result.getUses()) {
+    if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
+      // debug print the use and op, and the tmpLayout
+      LLVM_DEBUG({
+        DBGS() << "      use: " << use.getOwner()->getName() << use.getOwner();
+        llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
+      });
+      // under debug mode, we want to check all the use points to make sure
+      // there is no conflict, so we do not break here. In release mode, we can
+      // break at the first use
+#ifndef NDEBUG
+      assert(!layout || layout == tmpLayout);
+      layout = tmpLayout;
+#else
+      layout = tmpLayout;
+      break;
+#endif
+    }
+  }
+  return layout;
+}
+
+// For regular operations: First the result layouts are propagated from uses.
+// Then the result layouts are propagated to uses (operands).
+static void propagateResultsToRegularOperands(Operation *op) {
+  LLVM_DEBUG(DBGS() << "propagateResultsToRegularOperands: " << op->getName()
+                    << " (" << op->getNumOperands() << " operands, "
+                    << op->getNumResults() << " results)\n");
+
+  if (op->getNumResults() == 0) {
+    LLVM_DEBUG(DBGS() << "  skipping (no results)\n");
+    return;
+  }
+
+  Value result = op->getResult(0);
+  xegpu::DistributeLayoutAttr resLayout =
+      getLayoutFromUsePoints(op->getResult(0));
+  Type resultType = result.getType();
+
+  // recover layout for tensor Descriptor type, which is a special case since
+  // its layout is not stored as an attribute but encoded in the type itself.
+  // For vector type, we attach the layout as an attribute to op.
+  if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+    auto typeWithLayout = xegpu::TensorDescType::get(
+        tensorDescTy.getContext(), tensorDescTy.getShape(),
+        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+    result.setType(typeWithLayout);
+  }
+
+  for (OpOperand &opr : op->getOpOperands()) {
+    // Layouts are needed for vector type only.
+    xegpu::DistributeLayoutAttr operandLayout =
+        xegpu::inferSourceLayoutFromResult(opr, resLayout);
+    if (!isa<VectorType>(opr.get().getType())) {
+      LLVM_DEBUG(DBGS() << "  operand #" << opr.getOperandNumber()
+                        << ": skipped (non-vector type: " << opr.get().getType()
+                        << ")\n");
+      continue;
+    }
+
+    xegpu::setTemporaryLayout(opr, operandLayout);
+    // debug print op
+    LLVM_DEBUG(DBGS() << "after propagateResultsToRegularOperands  op: "
+                      << op->getName() << op << "  operand #"
+                      << opr.getOperandNumber()
+                      << ": type=" << opr.get().getType());
+    llvm::dbgs() << ", temp Layout=" << xegpu::getTemporaryLayout(opr);
+    llvm::dbgs() << "\n";
+  }
+}
+
+static void propagateRegionResultsToYieldOperands(
+    mlir::RegionBranchTerminatorOpInterface yieldOp) {
+  LLVM_DEBUG(DBGS() << "propagateRegionResultsToYieldOperands: "
+                    << yieldOp->getName() << " (" << yieldOp->getNumOperands()
+                    << " operands), parent="
+                    << yieldOp->getParentOp()->getName() << "\n");
+
+  if (func::FuncOp func = dyn_cast<func::FuncOp>(yieldOp->getParentOp())) {
+    LLVM_DEBUG(DBGS() << "  skipping (parent is FuncOp)\n");
+    return;
+  }
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(yieldOp->getNumOperands(),
+                                              nullptr);
+  yieldOp.getSuccessorRegions(operands, successors);
+
+  auto regionBranchOp = cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+
+  LLVM_DEBUG(DBGS() << "  found " << successors.size() << " successors\n");
+  for (mlir::RegionSuccessor &successor : successors) {
+    // debug print out successorr
+    LLVM_DEBUG({
+      DBGS() << "  successor: ";
+      if (successor.isParent()) {
+        DBGS() << "(parent operation)";
+      } else {
+        DBGS() << "region with " << successor.getSuccessor()->getNumArguments()
+               << " arguments";
+      }
+      DBGS() << "\n";
+    });
+    // find out the successor which is the parent region of yieldOp
+    // if (successor.getSuccessor() != yieldOp->getParentRegion()) {
+    //   LLVM_DEBUG(DBGS() << "  skipping successor (not parent region)\n");
+    //   continue;
+    // }
+    if (!successor.isParent())
+      continue;
+    // propagate the layout from region result to yield operands
+    ValueRange successorInputs = regionBranchOp.getSuccessorInputs(successor);
+    LLVM_DEBUG(DBGS() << "  propagating " << successorInputs.size()
+                      << " region results to yield operands\n");
+    for (unsigned i = 0; i < successorInputs.size(); ++i) {
+      Value regionResult = successorInputs[i];
+
+      // debug print regionResult
+      LLVM_DEBUG({
+        DBGS() << " before propagateRegionResultsToYieldOperands, Region IR:";
+        DBGS() << "    region result #" << i
+               << ": type=" << regionResult.getType();
+        llvm::dbgs() << regionResult;
+        llvm::dbgs() << "\n";
+      });
+      // find all the use of region result, and propagate the layout to the
+      // corresponding yield operand for all use of region result, get its
+      // layout from temporary operand layout if any of these use have it
+      xegpu::DistributeLayoutAttr layout = getLayoutFromUsePoints(regionResult);
+
+      // auto layout = xegpu::getDistributeLayoutAttr(regionResult);
+      if (layout == nullptr) {
+        LLVM_DEBUG(DBGS() << "    region result #" << i
+                          << ": skipped (no layout)\n");
         continue;
-      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
-      if (!layout) {
-        op->emitWarning("Could not find layout attribute for operand ")
-            << operand.getOperandNumber() << " of operation " << op->getName();
+      }
+      assert(
+          layout &&
+          "region result layout must be defined before propagating to yield");
+
+      if (auto opResult = dyn_cast<OpResult>(regionResult))
+        xegpu::setTemporaryLayout(opResult, layout);
+      xegpu::setTemporaryLayout(yieldOp->getOpOperand(i), layout);
+
+      LLVM_DEBUG({
+        DBGS() << " after propagateRegionResultsToYieldOperands, Region IR:";
+        regionResult.print(llvm::dbgs());
+        if (Operation *defOp = regionResult.getDefiningOp())
+          defOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+        llvm::dbgs() << "\n";
+      });
+    }
+  }
+}
+
+static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
+  LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
+                    << " (" << regionOp->getNumOperands() << " operands, "
+                    << regionOp->getNumRegions() << " regions)\n");
+  DBGS() << " before propagateRegionArgsToInits, Region IR:";
+  regionOp.print(llvm::dbgs());
+  DBGS() << " complex debug Region IR:";
+  regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+  // Get entry successors (regions that can be entered initially)
+  SmallVector<RegionSuccessor> successors;
+  regionOp.getEntrySuccessorRegions(/*operands=*/ArrayRef<Attribute>(),
+                                    successors);
+
+  LLVM_DEBUG(DBGS() << "  found " << successors.size()
+                    << " entry successors\n");
+  // For each possible entry region, get the operands forwarded to it
+  for (RegionSuccessor &successor : successors) {
+    OperandRange initOperands = regionOp.getEntrySuccessorOperands(successor);
+    unsigned beginIdx = initOperands.getBeginOperandIndex();
+    unsigned numArgs = successor.getSuccessor()->getNumArguments();
+    LLVM_DEBUG(DBGS() << "  successor region: " << numArgs
+                      << " args, initOperands beginIdx=" << beginIdx
+                      << ", count=" << initOperands.size() << "\n");
+    // initOperands are the initialization arguments for this successor
+    // iterate the region arguments
+    for (unsigned i = 0; i < numArgs; ++i) {
+      Value regionArg =
+          successor.getSuccessor()->getArgument(i); // region argument
+      auto layout = xegpu::getDistributeLayoutAttr(regionArg);
+      if (layout == nullptr) {
+        LLVM_DEBUG(DBGS() << "    region argument #" << i
+                          << ": skipped (no layout)\n");
         continue;
       }
-      xegpu::setTemporaryLayout(operand, layout);
+      assert(
+          layout &&
+          "region argument layout must be defined before propagating to init");
+      LLVM_DEBUG(DBGS() << "    regionArg #" << i << ": type="
+                        << regionArg.getType() << ", layout=" << layout
+                        << " -> init operand #" << (beginIdx + i) << "\n");
+      xegpu::setTemporaryLayout(regionOp->getOpOperand(beginIdx + i), layout);
     }
-    return WalkResult::advance();
+  }
+  DBGS() << " after propagateRegionArgsToInits, Region IR:";
+  regionOp.print(llvm::dbgs());
+}
+
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+  LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts START ===\n");
+
+  auto processFunc = [&](Region &body, StringRef funcName) {
+    LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
+    walkRegionBackward(body, [&](Operation *op) {
+      LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
+      if (op->getNumResults() > 0) {
+        LLVM_DEBUG(llvm::dbgs() << " [results: " << op->getNumResults());
+        for (OpResult res : op->getResults()) {
+          auto layout = xegpu::getDistributeLayoutAttr(res);
+          LLVM_DEBUG(llvm::dbgs() << " r#" << res.getResultNumber() << "="
+                                  << (layout ? layout : nullptr));
+        }
+        LLVM_DEBUG(llvm::dbgs() << "]");
+      }
+      LLVM_DEBUG(llvm::dbgs() << "\n");
+      if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
+        // hit the region op after visiting inside region
+        LLVM_DEBUG(DBGS() << "  -> dispatching as RegionBranchOp\n");
+        propagateRegionArgsToInits(regionOp);
+      } else if (auto yieldOp =
+                     dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
+        // yield op inside region op
+        LLVM_DEBUG(DBGS() << "  -> dispatching as YieldOp\n");
+        propagateRegionResultsToYieldOperands(yieldOp);
+      } else if (!dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
+        // if the op is regular op, calling propagateResultsToRegularOperands
+        LLVM_DEBUG(DBGS() << "  -> dispatching as regular op\n");
+        propagateResultsToRegularOperands(op);
+      }
+    });
+  };
+
+  rootOp->walk([&](func::FuncOp func) {
+    processFunc(func.getBody(), func.getSymName());
   });
-  return !result.wasInterrupted();
+  rootOp->walk([&](gpu::GPUFuncOp func) {
+    processFunc(func.getBody(), func.getName());
+  });
+
+  LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
+  return true;
 }
 
+// // Attach layout attributes to all vector-type operands of operations within
+// // the given operation's region. Reports an error if any vector operand lacks
+// // a layout attribute.
+// bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+//   auto result = rootOp->walk([&](Operation *op) {
+//     for (OpOperand &operand : op->getOpOperands()) {
+//       // Layouts are needed for vector type only.
+//       if (!isa<VectorType>(operand.get().getType()))
+//         continue;
+//       // Skip block arguments since they don't have defining ops to attach
+//       // layout attributes to.
+//       if (isa<BlockArgument>(operand.get()))
+//         continue;
+//       auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+//       if (!layout) {
+//         op->emitWarning("Could not find layout attribute for operand ")
+//             << operand.getOperandNumber() << " of operation " <<
+//             op->getName();
+//         xegpu::setTemporaryLayout(operand, layout);
+//         continue;
+//       }
+//     }
+//     return WalkResult::advance();
+//   });
+//   return !result.wasInterrupted();
+// }
+
 template <typename T, typename>
 void xegpu::removeLayoutAttr(const T &operandOrResult) {
   Operation *owner = operandOrResult.getOwner();
@@ -1108,99 +1412,178 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
   return std::nullopt;
 }
 
-xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
+xegpu::DistributeLayoutAttr
+xegpu::inferSourceLayoutFromResult(OpOperand &operand,
+                                   xegpu::DistributeLayoutAttr resLayout) {
   Operation *op = operand.getOwner();
   unsigned idx = operand.getOperandNumber();
-  xegpu::DistributeLayoutAttr resLayout;
-  if (op->getNumResults() == 1)
-    resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
 
   // For vector::BroadcastOp, infer the source layout from the result layout.
   if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG(DBGS() << "  -> BroadcastOp\n");
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
+    }
     auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
-    if (!srcTy)
+    if (!srcTy) {
+      LLVM_DEBUG(DBGS() << "     source is not VectorType, returning null\n");
       return xegpu::DistributeLayoutAttr();
-    return xegpu::inferBroadcastSourceLayout(
+    }
+    auto inferred = xegpu::inferBroadcastSourceLayout(
         resLayout, broadcast.getResultVectorType().getShape(),
         srcTy.getShape());
+    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
+    return inferred;
   }
 
   // For vector::MultiDimReductionOp, infer source layout from result layout
   // using reduction dims. Acc operand is expected to have the same layout as
   // the result.
   if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG(DBGS() << "  -> MultiDimReductionOp, operand idx=" << idx
+                      << "\n");
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
+    }
     if (idx == 0) {
       SmallVector<int64_t> reductionDims(reduction.getReductionDims());
-      return xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
+      LLVM_DEBUG({
+        DBGS() << "     reductionDims=[";
+        llvm::interleaveComma(reductionDims, llvm::dbgs());
+        llvm::dbgs() << "]\n";
+      });
+      auto inferred =
+          xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
+      LLVM_DEBUG(DBGS() << "     inferred source layout=" << inferred << "\n");
+      return inferred;
     }
-    if (idx == 1)
+    if (idx == 1) {
+      LLVM_DEBUG(DBGS() << "     acc operand, using resLayout\n");
       return resLayout;
+    }
   }
 
   if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG(DBGS() << "  -> ReductionOp\n");
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
-    return xegpu::inferReductionSourceLayout(resLayout);
+    }
+    auto inferred = xegpu::inferReductionSourceLayout(resLayout);
+    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
+    return inferred;
   }
 
   // For vector::BitCastOp, infer source layout from result layout using
   // element type bitwidths.
   if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG(DBGS() << "  -> BitCastOp\n");
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
+    }
     int resElemBitWidth =
         bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
     int srcElemBitWidth =
         bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
-    return xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
-                                           srcElemBitWidth);
+    LLVM_DEBUG(DBGS() << "     resBitWidth=" << resElemBitWidth
+                      << ", srcBitWidth=" << srcElemBitWidth << "\n");
+    auto inferred = xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
+                                                    srcElemBitWidth);
+    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
+    return inferred;
   }
 
   // For vector::ShapeCastOp, infer source layout from result layout using
   // shapes.
   if (auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG({
+      DBGS() << "  -> ShapeCastOp: resShape=[";
+      llvm::interleaveComma(shapeCast.getResultVectorType().getShape(),
+                            llvm::dbgs());
+      llvm::dbgs() << "], srcShape=[";
+      llvm::interleaveComma(shapeCast.getSourceVectorType().getShape(),
+                            llvm::dbgs());
+      llvm::dbgs() << "]\n";
+    });
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
-    return xegpu::inferShapeCastSourceLayout(
+    }
+    auto inferred = xegpu::inferShapeCastSourceLayout(
         resLayout, shapeCast.getResultVectorType().getShape(),
         shapeCast.getSourceVectorType().getShape());
+    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
+    return inferred;
   }
 
   // For vector::InsertStridedSliceOp, infer source layout from result layout.
   // Dest vector must have the same layout as the result.
   if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG(DBGS() << "  -> InsertStridedSliceOp, operand idx=" << idx
+                      << "\n");
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
-    if (idx == 0)
-      return xegpu::inferInsertStridedSliceSourceLayout(
+    }
+    if (idx == 0) {
+      auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
           resLayout, insertSlice.getDestVectorType().getShape(),
           insertSlice.getSourceVectorType().getShape());
-    if (idx == 1)
+      LLVM_DEBUG(DBGS() << "     inferred source layout=" << inferred << "\n");
+      return inferred;
+    }
+    if (idx == 1) {
+      LLVM_DEBUG(DBGS() << "     dest operand, using resLayout\n");
       return resLayout;
+    }
   }
 
   // For vector::TransposeOp, infer source layout from result layout using
   // permutation.
   if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
-    if (!resLayout)
+    LLVM_DEBUG({
+      DBGS() << "  -> TransposeOp, perm=[";
+      llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
+      llvm::dbgs() << "]\n";
+    });
+    if (!resLayout) {
+      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
       return xegpu::DistributeLayoutAttr();
-    return xegpu::inferTransposeSourceLayout(resLayout,
-                                             transpose.getPermutation());
+    }
+    auto inferred = xegpu::inferTransposeSourceLayout(
+        resLayout, transpose.getPermutation());
+    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
+    return inferred;
   }
 
   // For elementwise operations, all operands must have the same layout as the
   // result.
   if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
+    LLVM_DEBUG(DBGS() << "  -> elementwise op, using resLayout="
+                      << (resLayout ? resLayout : nullptr) << "\n");
     if (!resLayout)
       return xegpu::DistributeLayoutAttr();
     return resLayout;
   }
-  // TODO: Handle more cases as needed here.
+  return xegpu::DistributeLayoutAttr();
+}
+
+xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
+  Operation *op = operand.getOwner();
+  xegpu::DistributeLayoutAttr resLayout;
+  if (op->getNumResults() == 1)
+    resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+  auto inferredOperandLayout = inferSourceLayoutFromResult(operand, resLayout);
+  if (inferredOperandLayout)
+    return inferredOperandLayout;
   // By default, assume no layout conflict and return the current layout of
   // the operand.
-  return xegpu::getDistributeLayoutAttr(operand.get());
+  auto fallback = xegpu::getDistributeLayoutAttr(operand);
+  LLVM_DEBUG(DBGS() << "  -> fallback (unhandled op " << op->getName()
+                    << "), returning operand layout="
+                    << (fallback ? fallback : nullptr) << "\n");
+  return fallback;
 }
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 243581b4ce522..a762458105e47 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -23,10 +23,14 @@
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>
 
+#define DEBUG_TYPE "xegpu-utils"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
 using namespace mlir;
 
 /// convert ArrayRef<ValueRange> into SmallVector<Value>
@@ -145,19 +149,31 @@ std::string xegpu::getTemporaryLayoutName(const OpResult result) {
 }
 
 xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
-  if (!value)
+  LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(Value): type="
+                    << value.getType() << "\n");
+  if (!value) {
+    LLVM_DEBUG(DBGS() << "  -> null value, returning nullptr\n");
     return nullptr;
+  }
 
   if (auto tdescTy =
-          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
-    return tdescTy.getLayoutAttr();
+          dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
+    auto layout = tdescTy.getLayoutAttr();
+    LLVM_DEBUG(DBGS() << "  -> TensorDescType, layout="
+                      << (layout ? layout : nullptr) << "\n");
+    return layout;
+  }
 
   if (auto result = dyn_cast<OpResult>(value)) {
     Operation *defOp = result.getDefiningOp();
     assert(defOp && "result must have a defining op");
+    LLVM_DEBUG(DBGS() << "  OpResult #" << result.getResultNumber() << " from "
+                      << defOp->getName() << "\n");
 
     if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
       auto layout = anchorOp.getAnchorLayout();
+      LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface, layout="
+                        << (layout ? layout : nullptr) << "\n");
       return layout;
     }
 
@@ -165,59 +181,100 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     if (defOp->hasAttr(layoutName)) {
       auto layout =
           defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+      LLVM_DEBUG(DBGS() << "  -> temporary attr '" << layoutName
+                        << "', layout=" << layout << "\n");
       return layout;
     }
+    LLVM_DEBUG(DBGS() << "  -> OpResult: no layout found (checked '"
+                      << layoutName << "')\n");
   }
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
     auto *parentOp = arg.getOwner()->getParentOp();
+    LLVM_DEBUG(DBGS() << "  BlockArgument #" << arg.getArgNumber() << " of "
+                      << (parentOp ? parentOp->getName().getStringRef()
+                                   : StringRef("(null)"))
+                      << "\n");
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
-      if (tiedInit)
+      if (tiedInit) {
+        LLVM_DEBUG(DBGS() << "  -> LoopLikeOp, recursing into tiedInit "
+                          << "operand #" << tiedInit->getOperandNumber()
+                          << "\n");
         return getDistributeLayoutAttr(tiedInit->get());
+      }
+      LLVM_DEBUG(DBGS() << "  -> LoopLikeOp, no tiedInit\n");
     }
   }
 
+  LLVM_DEBUG(DBGS() << "  -> returning nullptr\n");
   return nullptr;
 }
 xegpu::DistributeLayoutAttr
 xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   Operation *op = opr.getOwner();
   unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
+  LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(OpOperand): operand #" << idx
+                    << " of " << op->getName()
+                    << ", type=" << opr.get().getType() << "\n");
 
   if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
       if (idx == 0) {
-        return dpasOp.getLayoutAAttr();
+        auto layout = dpasOp.getLayoutAAttr();
+        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutA="
+                          << (layout ? layout : nullptr) << "\n");
+        return layout;
       } else if (idx == 1) {
-        return dpasOp.getLayoutBAttr();
+        auto layout = dpasOp.getLayoutBAttr();
+        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutB="
+                          << (layout ? layout : nullptr) << "\n");
+        return layout;
       } else if (idx == 2) {
-        return dpasOp.getLayoutCdAttr();
+        auto layout = dpasOp.getLayoutCdAttr();
+        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutCd="
+                          << (layout ? layout : nullptr) << "\n");
+        return layout;
       }
     }
     if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
-      return convertOp.getInputLayoutAttr();
+      auto layout = convertOp.getInputLayoutAttr();
+      LLVM_DEBUG(DBGS() << "  -> ConvertLayoutOp inputLayout="
+                        << (layout ? layout : nullptr) << "\n");
+      return layout;
     }
     auto layout = anchorOp.getAnchorLayout();
 
-    if (idx == 0)
+    if (idx == 0) {
+      LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface idx=0, layout="
+                        << (layout ? layout : nullptr) << "\n");
       return layout;
+    }
 
     // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
     // the layout is valid for the first two operands: value and memref/tdesc.
     // For other operations, the layout applies to the first operand only.
     if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
             op) &&
-        (idx < 2))
+        (idx < 2)) {
+      LLVM_DEBUG(DBGS() << "  -> Store op idx=" << idx
+                        << ", layout=" << (layout ? layout : nullptr) << "\n");
       return layout;
+    }
+    LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface idx=" << idx
+                      << " not covered, falling through\n");
   }
 
   std::string layoutName = xegpu::getTemporaryLayoutName(opr);
   if (op->hasAttr(layoutName)) {
     auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+    LLVM_DEBUG(DBGS() << "  -> temporary attr '" << layoutName
+                      << "', layout=" << layout << "\n");
     return layout;
   }
 
+  LLVM_DEBUG(DBGS() << "  -> returning nullptr (checked '" << layoutName
+                    << "')\n");
   return nullptr;
 }
 

>From f77e110d9dc81257b2deeb9cccb20e10bea3739b Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 05:31:42 +0000
Subject: [PATCH 2/7] pass while

---
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 253 +++++++-----------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |   2 +-
 2 files changed, 103 insertions(+), 152 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 06cd0eaa0059e..47148870eeaae 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -147,13 +147,8 @@ static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
       // under debug mode, we want to check all the use points to make sure
       // there is no conflict, so we do not break here. In release mode, we can
       // break at the first use
-#ifndef NDEBUG
-      assert(!layout || layout == tmpLayout);
-      layout = tmpLayout;
-#else
-      layout = tmpLayout;
-      break;
-#endif
+      if (!layout)
+        layout = tmpLayout;
     }
   }
   return layout;
@@ -215,127 +210,118 @@ static void propagateRegionResultsToYieldOperands(
                     << " operands), parent="
                     << yieldOp->getParentOp()->getName() << "\n");
 
-  if (func::FuncOp func = dyn_cast<func::FuncOp>(yieldOp->getParentOp())) {
+  if (isa<func::FuncOp>(yieldOp->getParentOp())) {
     LLVM_DEBUG(DBGS() << "  skipping (parent is FuncOp)\n");
     return;
   }
-  llvm::SmallVector<mlir::RegionSuccessor> successors;
-  llvm::SmallVector<mlir::Attribute> operands(yieldOp->getNumOperands(),
-                                              nullptr);
-  yieldOp.getSuccessorRegions(operands, successors);
 
-  auto regionBranchOp = cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+  auto regionBranchOp =
+      dyn_cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+  if (!regionBranchOp) {
+    LLVM_DEBUG(DBGS() << "  skipping (parent is not RegionBranchOp)\n");
+    return;
+  }
 
-  LLVM_DEBUG(DBGS() << "  found " << successors.size() << " successors\n");
-  for (mlir::RegionSuccessor &successor : successors) {
-    // debug print out successorr
-    LLVM_DEBUG({
-      DBGS() << "  successor: ";
-      if (successor.isParent()) {
-        DBGS() << "(parent operation)";
-      } else {
-        DBGS() << "region with " << successor.getSuccessor()->getNumArguments()
-               << " arguments";
-      }
-      DBGS() << "\n";
-    });
-    // find out the successor which is the parent region of yieldOp
-    // if (successor.getSuccessor() != yieldOp->getParentRegion()) {
-    //   LLVM_DEBUG(DBGS() << "  skipping successor (not parent region)\n");
-    //   continue;
-    // }
-    if (!successor.isParent())
-      continue;
-    // propagate the layout from region result to yield operands
-    ValueRange successorInputs = regionBranchOp.getSuccessorInputs(successor);
-    LLVM_DEBUG(DBGS() << "  propagating " << successorInputs.size()
-                      << " region results to yield operands\n");
-    for (unsigned i = 0; i < successorInputs.size(); ++i) {
-      Value regionResult = successorInputs[i];
-
-      // debug print regionResult
-      LLVM_DEBUG({
-        DBGS() << " before propagateRegionResultsToYieldOperands, Region IR:";
-        DBGS() << "    region result #" << i
-               << ": type=" << regionResult.getType();
-        llvm::dbgs() << regionResult;
-        llvm::dbgs() << "\n";
-      });
-      // find all the use of region result, and propagate the layout to the
-      // corresponding yield operand for all use of region result, get its
-      // layout from temporary operand layout if any of these use have it
-      xegpu::DistributeLayoutAttr layout = getLayoutFromUsePoints(regionResult);
-
-      // auto layout = xegpu::getDistributeLayoutAttr(regionResult);
-      if (layout == nullptr) {
-        LLVM_DEBUG(DBGS() << "    region result #" << i
-                          << ": skipped (no layout)\n");
-        continue;
-      }
-      assert(
-          layout &&
-          "region result layout must be defined before propagating to yield");
+  // Gather layouts for each result of the parent region op from external
+  // use points.
+  unsigned numResults = regionBranchOp->getNumResults();
+  LLVM_DEBUG(DBGS() << "  parent op has " << numResults << " results\n");
+
+  SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
+  for (unsigned i = 0; i < numResults; ++i) {
+    OpResult result = regionBranchOp->getResult(i);
+    resultLayouts[i] = getLayoutFromUsePoints(result);
+    if (resultLayouts[i]) {
+      LLVM_DEBUG(DBGS() << "  result #" << i << ": type=" << result.getType()
+                        << ", layout=" << resultLayouts[i] << "\n");
+      xegpu::setTemporaryLayout(result, resultLayouts[i]);
+    } else {
+      LLVM_DEBUG(DBGS() << "  result #" << i
+                        << ": skipped (no layout from use points)\n");
+    }
+  }
 
-      if (auto opResult = dyn_cast<OpResult>(regionResult))
-        xegpu::setTemporaryLayout(opResult, layout);
-      xegpu::setTemporaryLayout(yieldOp->getOpOperand(i), layout);
+  // Use getSuccessorOperands to find which operands of the terminator
+  // flow to a successor. This handles index offsets automatically (e.g.,
+  // scf.condition's predicate at operand #0 is excluded).
+  // Pick the first successor to determine the operand range.
+  SmallVector<RegionSuccessor> successors;
+  SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
+  yieldOp.getSuccessorRegions(operandAttrs, successors);
+  assert(!successors.empty() && "terminator must have at least one successor");
 
-      LLVM_DEBUG({
-        DBGS() << " after propagateRegionResultsToYieldOperands, Region IR:";
-        regionResult.print(llvm::dbgs());
-        if (Operation *defOp = regionResult.getDefiningOp())
-          defOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-        llvm::dbgs() << "\n";
-      });
-    }
+  OperandRange succOps = yieldOp.getSuccessorOperands(successors.front());
+  unsigned beginIdx = succOps.getBeginOperandIndex();
+  unsigned count = std::min(static_cast<unsigned>(succOps.size()), numResults);
+
+  LLVM_DEBUG(DBGS() << "  " << count << " successor operands starting at index "
+                    << beginIdx << "\n");
+
+  for (unsigned i = 0; i < count; ++i) {
+    if (!resultLayouts[i])
+      continue;
+    LLVM_DEBUG(DBGS() << "    -> setting layout on operand #" << (beginIdx + i)
+                      << "\n");
+    xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i),
+                              resultLayouts[i]);
   }
+
+  LLVM_DEBUG({
+    DBGS() << " after propagateRegionResultsToYieldOperands:\n";
+    yieldOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+    llvm::dbgs() << "\n";
+  });
 }
 
 static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
   LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
                     << " (" << regionOp->getNumOperands() << " operands, "
                     << regionOp->getNumRegions() << " regions)\n");
-  DBGS() << " before propagateRegionArgsToInits, Region IR:";
-  regionOp.print(llvm::dbgs());
-  DBGS() << " complex debug Region IR:";
-  regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-  // Get entry successors (regions that can be entered initially)
-  SmallVector<RegionSuccessor> successors;
-  regionOp.getEntrySuccessorRegions(/*operands=*/ArrayRef<Attribute>(),
-                                    successors);
-
-  LLVM_DEBUG(DBGS() << "  found " << successors.size()
-                    << " entry successors\n");
-  // For each possible entry region, get the operands forwarded to it
-  for (RegionSuccessor &successor : successors) {
-    OperandRange initOperands = regionOp.getEntrySuccessorOperands(successor);
-    unsigned beginIdx = initOperands.getBeginOperandIndex();
-    unsigned numArgs = successor.getSuccessor()->getNumArguments();
-    LLVM_DEBUG(DBGS() << "  successor region: " << numArgs
-                      << " args, initOperands beginIdx=" << beginIdx
-                      << ", count=" << initOperands.size() << "\n");
-    // initOperands are the initialization arguments for this successor
-    // iterate the region arguments
-    for (unsigned i = 0; i < numArgs; ++i) {
-      Value regionArg =
-          successor.getSuccessor()->getArgument(i); // region argument
-      auto layout = xegpu::getDistributeLayoutAttr(regionArg);
-      if (layout == nullptr) {
-        LLVM_DEBUG(DBGS() << "    region argument #" << i
-                          << ": skipped (no layout)\n");
+  LLVM_DEBUG({
+    DBGS() << " before propagateRegionArgsToInits, Region IR:\n";
+    regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+    llvm::dbgs() << "\n";
+  });
+
+  // Iterate all regions of the region op. For each block argument that has a
+  // layout (determined from its use points), trace back to find the
+  // corresponding init operand of the regionOp and set the layout on it.
+  // This works generically for scf.for, scf.while, and other
+  // RegionBranchOpInterface ops.
+  for (Region &region : regionOp->getRegions()) {
+    RegionSuccessor regionSuccessor(&region);
+    for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
+      auto layout = getLayoutFromUsePoints(regionArg);
+      if (!layout) {
+        LLVM_DEBUG(DBGS() << "  region #" << region.getRegionNumber()
+                          << " arg #" << argIdx << ": skipped (no layout)\n");
         continue;
       }
-      assert(
-          layout &&
-          "region argument layout must be defined before propagating to init");
-      LLVM_DEBUG(DBGS() << "    regionArg #" << i << ": type="
-                        << regionArg.getType() << ", layout=" << layout
-                        << " -> init operand #" << (beginIdx + i) << "\n");
-      xegpu::setTemporaryLayout(regionOp->getOpOperand(beginIdx + i), layout);
+      LLVM_DEBUG(DBGS() << "  region #" << region.getRegionNumber() << " arg #"
+                        << argIdx << ": type=" << regionArg.getType()
+                        << ", layout=" << layout << "\n");
+
+      // Find all predecessor values that flow into this block argument.
+      SmallVector<Value> predValues;
+      regionOp.getPredecessorValues(regionSuccessor, argIdx, predValues);
+      for (Value predVal : predValues) {
+        // Match predecessor value to an operand of the regionOp.
+        for (OpOperand &operand : regionOp->getOpOperands()) {
+          if (operand.get() == predVal) {
+            LLVM_DEBUG(DBGS() << "    -> setting layout on init operand #"
+                              << operand.getOperandNumber() << "\n");
+            xegpu::setTemporaryLayout(operand, layout);
+          }
+        }
+      }
     }
   }
-  DBGS() << " after propagateRegionArgsToInits, Region IR:";
-  regionOp.print(llvm::dbgs());
+
+  LLVM_DEBUG({
+    DBGS() << " after propagateRegionArgsToInits, Region IR:\n";
+    regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+    llvm::dbgs() << "\n";
+  });
 }
 
 bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
@@ -345,16 +331,6 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
     LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
     walkRegionBackward(body, [&](Operation *op) {
       LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
-      if (op->getNumResults() > 0) {
-        LLVM_DEBUG(llvm::dbgs() << " [results: " << op->getNumResults());
-        for (OpResult res : op->getResults()) {
-          auto layout = xegpu::getDistributeLayoutAttr(res);
-          LLVM_DEBUG(llvm::dbgs() << " r#" << res.getResultNumber() << "="
-                                  << (layout ? layout : nullptr));
-        }
-        LLVM_DEBUG(llvm::dbgs() << "]");
-      }
-      LLVM_DEBUG(llvm::dbgs() << "\n");
       if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
         // hit the region op after visiting inside region
         LLVM_DEBUG(DBGS() << "  -> dispatching as RegionBranchOp\n");
@@ -1415,16 +1391,16 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
 xegpu::DistributeLayoutAttr
 xegpu::inferSourceLayoutFromResult(OpOperand &operand,
                                    xegpu::DistributeLayoutAttr resLayout) {
+  if (!resLayout) {
+    LLVM_DEBUG(DBGS() << "no resLayout, returning null\n");
+    return xegpu::DistributeLayoutAttr();
+  }
   Operation *op = operand.getOwner();
   unsigned idx = operand.getOperandNumber();
 
   // For vector::BroadcastOp, infer the source layout from the result layout.
   if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
     LLVM_DEBUG(DBGS() << "  -> BroadcastOp\n");
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
     if (!srcTy) {
       LLVM_DEBUG(DBGS() << "     source is not VectorType, returning null\n");
@@ -1443,10 +1419,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
   if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
     LLVM_DEBUG(DBGS() << "  -> MultiDimReductionOp, operand idx=" << idx
                       << "\n");
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     if (idx == 0) {
       SmallVector<int64_t> reductionDims(reduction.getReductionDims());
       LLVM_DEBUG({
@@ -1467,10 +1439,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
 
   if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
     LLVM_DEBUG(DBGS() << "  -> ReductionOp\n");
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     auto inferred = xegpu::inferReductionSourceLayout(resLayout);
     LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
     return inferred;
@@ -1480,10 +1448,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
   // element type bitwidths.
   if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
     LLVM_DEBUG(DBGS() << "  -> BitCastOp\n");
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     int resElemBitWidth =
         bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
     int srcElemBitWidth =
@@ -1508,10 +1472,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
                             llvm::dbgs());
       llvm::dbgs() << "]\n";
     });
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     auto inferred = xegpu::inferShapeCastSourceLayout(
         resLayout, shapeCast.getResultVectorType().getShape(),
         shapeCast.getSourceVectorType().getShape());
@@ -1524,10 +1484,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
   if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
     LLVM_DEBUG(DBGS() << "  -> InsertStridedSliceOp, operand idx=" << idx
                       << "\n");
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     if (idx == 0) {
       auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
           resLayout, insertSlice.getDestVectorType().getShape(),
@@ -1549,10 +1505,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
       llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
       llvm::dbgs() << "]\n";
     });
-    if (!resLayout) {
-      LLVM_DEBUG(DBGS() << "     no resLayout, returning null\n");
-      return xegpu::DistributeLayoutAttr();
-    }
     auto inferred = xegpu::inferTransposeSourceLayout(
         resLayout, transpose.getPermutation());
     LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
@@ -1564,8 +1516,7 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
   if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
     LLVM_DEBUG(DBGS() << "  -> elementwise op, using resLayout="
                       << (resLayout ? resLayout : nullptr) << "\n");
-    if (!resLayout)
-      return xegpu::DistributeLayoutAttr();
+
     return resLayout;
   }
   return xegpu::DistributeLayoutAttr();
@@ -1581,7 +1532,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
     return inferredOperandLayout;
   // By default, assume no layout conflict and return the current layout of
   // the operand.
-  auto fallback = xegpu::getDistributeLayoutAttr(operand);
+  auto fallback = xegpu::getDistributeLayoutAttr(operand.get());
   LLVM_DEBUG(DBGS() << "  -> fallback (unhandled op " << op->getName()
                     << "), returning operand layout="
                     << (fallback ? fallback : nullptr) << "\n");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 4c30dacae8850..f0ff771f4cbc4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1338,7 +1338,7 @@ LogicalResult ResolveLayoutConflicts::run() {
     // as anchor op for the reduction op's layout.
     if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
       for (OpResult result : op->getResults()) {
-        if (result.getType().isIntOrFloat()) {
+        if (result.getType().isIntOrFloat() || result.use_empty()) {
           auto res = assignResultLayout(result);
           if (failed(res)) {
             DBGS() << "Failed to resolve vector consumer for multi-reduction "

>From 27cc56acf41eb3380d3195fca5a9215b4414a413 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 18:00:50 +0000
Subject: [PATCH 3/7] adding support for DistributeLayoutAttr in TensorDesc
 instead of just LayoutAttr

---
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  6 +++---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |  5 +++--
 .../XeGPU/Transforms/XeGPUBlocking.cpp        | 13 ++++++------
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 21 +++++++++++++------
 .../Transforms/XeGPUPeepHoleOptimizer.cpp     | 11 +++++++---
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  9 ++++----
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  2 +-
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  4 ++--
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 13 ++++++------
 9 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 7e142b20c0894..b13f5a9f2c9d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -82,7 +82,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
     static-dim-list ::= decimal-literal `x` decimal-literal
     attr-list = (, encoding-attr)? (, layout-attr)?
     enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
-    layout-attr = (, layout `<`sg_layout = value, sg_data = value, inst_data = value, lane_layout = value, lane_data = value, order = value`>`)?
+    layout-attr = DistributeLayoutAttr
     ```
 
     Examples:
@@ -158,8 +158,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
       return llvm::dyn_cast_if_present<T>(getEncoding());
     }
 
-    LayoutAttr getLayoutAttr() const {
-      return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+    DistributeLayoutAttr getLayoutAttr() const {
+      return llvm::dyn_cast_if_present<DistributeLayoutAttr>(getLayout());
     }
 
     xegpu::MemorySpace getMemorySpace() const {
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 0aa2cd45088f3..1b594f17e15ec 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -219,10 +219,11 @@ void setTemporaryLayout(const T &operandOrResult,
 /// Helper function to check if the layout is packed. Layout is packed if it is
 /// 2D and lane_data[0] != 1 (data packed from col dimension).
 /// TODO: Move to target info.
-bool requirePacked(const LayoutAttr layout);
+bool requirePacked(const DistributeLayoutAttr layout);
 
 /// Helper function to check if the layout requires a transpose effect.
-bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch);
+bool requireTranspose(const DistributeLayoutAttr layout,
+                      const uArch::uArch *uArch);
 
 // Check if dst shape is an expansion of src shape by inserting unit dimensions.
 bool matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 1ee0bc6ad9507..ef6a494b76638 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -270,12 +270,11 @@ void XeGPUBlockingPass::runOnOperation() {
   }
 
   auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
-                                 xegpu::LayoutAttr layout) {
+                                 xegpu::DistributeLayoutAttr layout) {
     int count = 1;
     SmallVector<int64_t> tileShape(shape);
-    if (layout && layout.getInstData()) {
-      DenseI32ArrayAttr instData = layout.getInstData();
-      tileShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+    if (layout && !layout.getEffectiveInstDataAsInt().empty()) {
+      tileShape = layout.getEffectiveInstDataAsInt();
       count = computeProduct(shape) / computeProduct(tileShape);
     }
     return std::make_pair(tileShape, count);
@@ -308,7 +307,7 @@ void XeGPUBlockingPass::runOnOperation() {
         Type elemTy = type.getElementType();
         ArrayRef<int64_t> shape = type.getShape();
 
-        xegpu::LayoutAttr layout = type.getLayoutAttr();
+        xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
         if (layout && layout.isForWorkgroup())
           return failure();
 
@@ -348,9 +347,9 @@ void XeGPUBlockingPass::runOnOperation() {
 
         if (chunkSize > 1) {
           int64_t blockedChunkSize = chunkSize;
-          auto instData = tdescTy.getLayoutAttr().getInstData();
+          auto instData = tdescTy.getLayoutAttr().getEffectiveInstDataAsInt();
           if (!instData.empty())
-            blockedChunkSize = instData.asArrayRef().back();
+            blockedChunkSize = instData.back();
 
           // To create a new attribute with a different chunk_size:
           auto newEncoding = xegpu::ScatterTensorDescAttr::get(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 47148870eeaae..535239e869af1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -141,7 +141,8 @@ static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
     if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
       // debug print the use and op, and the tmpLayout
       LLVM_DEBUG({
-        DBGS() << "      use: " << use.getOwner()->getName() << use.getOwner();
+        DBGS() << "getLayoutFromUsePoints  use: " << use.getOwner()->getName()
+               << use.getOwner();
         llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
       });
       // under debug mode, we want to check all the use points to make sure
@@ -175,10 +176,16 @@ static void propagateResultsToRegularOperands(Operation *op) {
   // its layout is not stored as an attribute but encoded in the type itself.
   // For vector type, we attach the layout as an attribute to op.
   if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
-    auto typeWithLayout = xegpu::TensorDescType::get(
-        tensorDescTy.getContext(), tensorDescTy.getShape(),
-        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
-    result.setType(typeWithLayout);
+    auto layout = tensorDescTy.getLayoutAttr();
+    // TODO: Remove this layout check. The TensorDescType's layout is treated
+    // as a temporary layout that should be set by layout recovery; an existing
+    // layout is preserved for now so that some legacy test cases keep passing.
+    if (!layout) {
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+      result.setType(typeWithLayout);
+    }
   }
 
   for (OpOperand &opr : op->getOpOperands()) {
@@ -226,6 +233,8 @@ static void propagateRegionResultsToYieldOperands(
   // use points.
   unsigned numResults = regionBranchOp->getNumResults();
   LLVM_DEBUG(DBGS() << "  parent op has " << numResults << " results\n");
+  if (numResults == 0)
+    return;
 
   SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
   for (unsigned i = 0; i < numResults; ++i) {
@@ -303,7 +312,7 @@ static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
 
       // Find all predecessor values that flow into this block argument.
       SmallVector<Value> predValues;
-      regionOp.getPredecessorValues(regionSuccessor, argIdx, predValues);
+      regionOp.getPredecessorValues(regionSuccessor, argIdx - 1, predValues);
       for (Value predVal : predValues) {
         // Match predecessor value to an operand of the regionOp.
         for (OpOperand &operand : regionOp->getOpOperands()) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 0ece695aed512..9288ba9a0cb56 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -145,10 +145,15 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
     return tdescType;
 
   SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
+  auto ctx = tdescType.getContext();
+  auto origLayout = tdescType.getLayoutAttr();
+  SmallVector<int32_t> laneLayoutI32(
+      origLayout.getEffectiveLaneLayoutAsInt().begin(),
+      origLayout.getEffectiveLaneLayoutAsInt().end());
   xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
-      tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout(),
-      DenseI32ArrayAttr::get(tdescType.getContext(), {1, 1}),
-      tdescType.getLayoutAttr().getOrder());
+      ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
+      /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
+      /*order=*/origLayout.getOrder());
   // Array length can not be larger than 1 for transpose case.
   return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                     tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ecdf253d68182..d8ce24ddd5cb0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -256,7 +256,7 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
     auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
     unsigned operandIdx = operand->getOperandNumber();
 
-    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+    xegpu::DistributeLayoutAttr layout = descOp.getType().getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           descOp, "the tensor descriptor lacks layout attribute");
@@ -342,7 +342,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
     SmallVector<Type> offsetTypes = llvm::map_to_vector(
         offsetsAsValues, [](Value v) { return v.getType(); });
     xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
-    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           storeOp, "the source tensor descriptor lacks layout attribute");
@@ -474,7 +474,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
         offsetsAsValues, [](Value v) { return v.getType(); });
 
     xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
-    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           loadOp, "the source tensor descriptor lacks layout attribute");
@@ -709,7 +709,8 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
     SmallVector<Type> offsetTypes = llvm::map_to_vector(
         offsetsAsValues, [](Value v) { return v.getType(); });
 
-    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
+    xegpu::DistributeLayoutAttr layout =
+        prefetchOp.getTensorDescType().getLayoutAttr();
     if (!layout)
       return rewriter.notifyMatchFailure(
           prefetchOp, "the source tensor descriptor lacks layout attribute");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 0aead9172858f..e47224bbe755c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1647,7 +1647,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
   converter.addConversion(
       [&](xegpu::TensorDescType type,
           SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
-        xegpu::LayoutAttr layout = type.getLayoutAttr();
+        xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
         // Only convert WG-level tensor descs. SG-level or layout-less types
         // are already legal and should pass through unchanged.
         if (!layout || !layout.isForWorkgroup())
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a762458105e47..55cf47e38dfd0 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -936,7 +936,7 @@ template int
 xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
                                    ArrayRef<unsigned> candidateMultiples);
 
-bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
+bool xegpu::requirePacked(const xegpu::DistributeLayoutAttr layout) {
   if (!layout)
     return false;
   auto laneData = layout.getEffectiveLaneDataAsInt();
@@ -945,7 +945,7 @@ bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
   return laneData[0] != 1;
 }
 
-bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
+bool xegpu::requireTranspose(const xegpu::DistributeLayoutAttr layout,
                              const xegpu::uArch::uArch *uArch) {
   // Return false for unsupported targets.
   // TODO: Add more support or move to target info.
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 0d10ab7c74da6..4760016bdcea4 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -106,10 +106,9 @@ struct TestXeGPUUnrollingPatterns
         }
 
         if (auto layout = tdescTy.getLayoutAttr()) {
-          auto inst_data = layout.getInstData();
-          if (inst_data && layout.isForSubgroup())
-            return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
-                                        inst_data.asArrayRef().end());
+          auto inst_data = layout.getEffectiveInstDataAsInt();
+          if (!inst_data.empty() && layout.isForSubgroup())
+            return SmallVector<int64_t>(inst_data.begin(), inst_data.end());
         }
       }
 
@@ -138,9 +137,9 @@ struct TestXeGPUUnrollingPatterns
 
               if (chunkSize > 1) {
                 int64_t blockedChunkSize = chunkSize;
-                auto instData = layout.getInstData();
+                auto instData = layout.getEffectiveInstDataAsInt();
                 if (!instData.empty())
-                  blockedChunkSize = instData.asArrayRef().back();
+                  blockedChunkSize = instData.back();
 
                 // To create a new attribute with a different chunk_size:
                 auto newEncoding = xegpu::ScatterTensorDescAttr::get(
@@ -150,7 +149,7 @@ struct TestXeGPUUnrollingPatterns
               }
             }
             if (layout) {
-              if (layout.getLaneLayout() == nullptr)
+              if (layout.getEffectiveLaneLayoutAsInt().empty())
                 layout = xegpu::LayoutAttr();
               else
                 layout = layout.dropInstData();

>From 0690c6cc01e121b137c1056e62d22ff207a82777 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:02:43 +0000
Subject: [PATCH 4/7] fix bugs

---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  2 +-
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      |  6 +++
 .../Transforms/XeGPUPeepHoleOptimizer.cpp     | 19 ++++++++--
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 38 ++++++++++++++++++-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     | 19 +---------
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   |  4 +-
 6 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 950371e17255f..64c56b5adf5d7 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -1318,7 +1318,7 @@ mlir::Type TensorDescType::parse(AsmParser &parser) {
     mlir::Attribute attr;
     ParseResult res = parser.parseAttribute(attr);
     if (mlir::succeeded(res)) {
-      if (mlir::isa<LayoutAttr>(attr)) {
+      if (mlir::isa<DistributeLayoutAttr>(attr)) {
         layout = attr;
         continue;
       }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 535239e869af1..33c9086566d3c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -365,6 +365,12 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
   });
 
   LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
+  // print the root op after
+  LLVM_DEBUG({
+    DBGS() << "After recoverTemporaryLayouts, IR:\n";
+    rootOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+    llvm::dbgs() << "\n";
+  });
   return true;
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 9288ba9a0cb56..c43eaba5b3ee6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -28,6 +28,7 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
 #include <optional>
 
 namespace mlir {
@@ -147,13 +148,25 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
   SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
   auto ctx = tdescType.getContext();
   auto origLayout = tdescType.getLayoutAttr();
-  SmallVector<int32_t> laneLayoutI32(
-      origLayout.getEffectiveLaneLayoutAsInt().begin(),
-      origLayout.getEffectiveLaneLayoutAsInt().end());
+  auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
+  SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
+                                     laneLayoutI64.end());
+  LLVM_DEBUG({
+    DBGS() << "tryOptimize: origLayout=" << origLayout << "\n";
+    DBGS() << "  laneLayoutI32=[";
+    llvm::interleaveComma(laneLayoutI32, llvm::dbgs());
+    llvm::dbgs() << "], laneData=[1, 1]";
+    if (origLayout.getOrder())
+      llvm::dbgs() << ", order=" << origLayout.getOrder();
+    llvm::dbgs() << "\n";
+    DBGS() << "  supportedShape=[" << supportedHeight << ", " << supportedWidth
+           << "], newElemTy=" << newElemTy << ", arrayLen=" << arrayLen << "\n";
+  });
   xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
       ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
       /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
       /*order=*/origLayout.getOrder());
+  LLVM_DEBUG(DBGS() << "  newLayout=" << newLayout << "\n");
   // Array length can not be larger than 1 for transpose case.
   return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                     tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d8ce24ddd5cb0..27cf788933f18 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -800,10 +800,17 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
+    LLVM_DEBUG(DBGS() << "StoreDistribution: attempting to match\n");
     Operation *lastNode = warpOp.getTerminator()->getPrevNode();
     auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
-    if (!storeScatterOp)
+    if (!storeScatterOp) {
+      LLVM_DEBUG(
+          DBGS()
+          << "StoreDistribution: last node is not StoreScatterOp, skipping\n");
       return failure();
+    }
+    LLVM_DEBUG(DBGS() << "StoreDistribution: matched StoreScatterOp: "
+                      << *storeScatterOp << "\n");
     auto offsets = storeScatterOp.getOffsets();
     if (!offsets || !isa<VectorType>(offsets.getType()))
       return rewriter.notifyMatchFailure(
@@ -811,10 +818,15 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
     VectorType offsetsTy = cast<VectorType>(offsets.getType());
     VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
     VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
+    LLVM_DEBUG(DBGS() << "StoreDistribution: offsetsTy=" << offsetsTy
+                      << ", maskTy=" << maskTy << ", storeVecTy=" << storeVecTy
+                      << "\n");
 
     // Add handling for leading unit dimensions support
     int chunkSize = storeScatterOp.getChunkSize().value_or(1);
     int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+    LLVM_DEBUG(DBGS() << "StoreDistribution: chunkSize=" << chunkSize
+                      << ", effectiveVecRank=" << effectiveVecRank << "\n");
 
     // Check that all leading dimensions are unit dimensions
     for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
@@ -831,6 +843,24 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
         xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
     auto layoutMask =
         xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
+    LLVM_DEBUG({
+      DBGS() << "StoreDistribution: layoutPayload=";
+      if (layoutPayload)
+        DBGS() << layoutPayload;
+      else
+        DBGS() << "(null)";
+      DBGS() << ", layoutOffsets=";
+      if (layoutOffsets)
+        DBGS() << layoutOffsets;
+      else
+        DBGS() << "(null)";
+      DBGS() << ", layoutMask=";
+      if (layoutMask)
+        DBGS() << layoutMask;
+      else
+        DBGS() << "(null)";
+      DBGS() << "\n";
+    });
 
     FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -849,6 +879,9 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
     VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
     VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
     VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
+    LLVM_DEBUG(DBGS() << "StoreDistribution: distPayloadTy=" << distPayloadTy
+                      << ", distOffsetsTy=" << distOffsetsTy
+                      << ", distMaskTy=" << distMaskTy << "\n");
 
     SmallVector<size_t> newRetIndices;
     SmallVector<Value> operands = storeScatterOp->getOperands();
@@ -885,7 +918,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
         rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
         storeScatterOp->getAttrs());
     xegpu::removeLayoutAttrs(newOp);
+    LLVM_DEBUG(DBGS() << "StoreDistribution: created new op: " << newOp
+                      << "\n");
     rewriter.eraseOp(storeScatterOp);
+    LLVM_DEBUG(DBGS() << "StoreDistribution: done\n");
     return success();
   }
 };
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 842c2375dd31d..0d1bfd5480aa2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,22 +473,6 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
-// CHECK-LABEL: gpu.func @vector_transpose
-// CHECK:         %[[SRC:.*]] = "some_op"()
-// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
-// CHECK-NEXT:    %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
-// CHECK-NEXT:    gpu.return
-gpu.func @vector_transpose() {
-  %cst = "some_op"()
-    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
-    : () -> (vector<16x2xf32>)
-  %transpose = vector.transpose %cst, [1, 0]
-    {
-      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-    }
-    : vector<16x2xf32> to vector<2x16xf32>
-  gpu.return
-}
 
 // CHECK-LABEL: gpu.func @vector_bitcast
 // CHECK:         %[[SRC:.*]] = "some_op"()
@@ -1092,7 +1076,8 @@ gpu.module @xevm_module {
 gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
   %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
   %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
-  "some_use"(%1) : (vector<16x16xf16>) -> ()
+  %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
+  "some_use"(%2) : (vector<16x16xf16>) -> ()
   gpu.return
 }
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 9ca424374335f..61b8046bd04e5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -257,7 +257,7 @@ gpu.module @test_kernel {
 
 // -----
 #l = #xegpu.layout<inst_data = [16, 16]>
-#r = #xegpu.layout<inst_data = [16]>
+#r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [0]>
 gpu.module @test_kernel  {
   gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32>
@@ -277,7 +277,7 @@ gpu.module @test_kernel  {
 
 // -----
 #l = #xegpu.layout<inst_data = [16, 16]>
-#r = #xegpu.layout<inst_data = [16]>
+#r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [1]>
 gpu.module @test_kernel   {
   gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>)  kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
     %c1 = arith.constant 1 : index

>From ac36ceaccbd9bff10bf933ffef9b0b0d1e557cdc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:24:35 +0000
Subject: [PATCH 5/7] separate recover temporary layout out to another PR

---
 .../XeGPU/Transforms/XeGPULayoutImpl.h        |   5 +-
 .../XeGPU/Transforms/XeGPULayoutImpl.cpp      | 457 +++---------------
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |   2 +-
 .../XeGPU/sg-to-wi-experimental-unit.mlir     |  19 +-
 4 files changed, 73 insertions(+), 410 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 5f46eab7b74c7..9cf9a8705209b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -183,13 +183,10 @@ setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
                 VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
                 const uArch::uArch *uArch);
 
-DistributeLayoutAttr
-inferSourceLayoutFromResult(OpOperand &operand, DistributeLayoutAttr resLayout);
-
 /// Gets the expected layout for a given consumer operand. This will check if
 /// the owning operation of the consumer operand is one of the special layout
 /// users and determine the expected layout accordingly.
-DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
+xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
 
 } // namespace xegpu
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 33c9086566d3c..55cd6ec04970c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -18,22 +18,16 @@
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/ValueRange.h"
-#include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>
 
-#define DEBUG_TYPE "xegpu-layout-recovery"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
 using namespace mlir;
 
 void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
@@ -86,321 +80,32 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
   return out;
 }
 
-// Prerequisite for Layout Recovery
-// It relies on the following invariant:
-// 1. there is no layout conflict between different uses of the same definition.
-// 2. each definition has a well-defined layout requirement at its use point.
-//     - Every definition must have at least one use that appears after it in
-//     topological order.
-//     - If a definition has no such use (e.g., a loop result or region output),
-//     an explicit convert_layout operation is inserted to create a use.
-//     - Only the result of convert_layout is permitted to have no subsequent
-//     use.
-
-// The recovery proceeds by scanning the operation in reverse topological order
-// as follows:
-//    For regular operations: First the result layouts are propagated from uses.
-//      Then the result layouts are propagated to operands.
-//
-//    For region operations (e.g., loops):
-//       - When backward propagation reaches a region op, it sets the layout of
-//       the region op’s results according to use points like regular ops.
-//       - Then, the result layouts (such as a loop output) are propagated to
-//       their corresponding operands in the yield.
-//       - When backward propagation reaches the first operation inside the
-//       region, the pass examines the region op’s initialization list,
-//       propagating from region arguments to the corresponding initialization
-//       operands.
-//       - This ensures that layouts are consistently propagated
-//       across region boundaries while preserving a single well-defined use for
-//       each definition at the region-op level.
-
-// the inner function for recoverTemporaryLayouts is a recursive function
-// the input rootOp is the function operation, which is also a region op.
-// it recursivley process the region op in reverse topological order.
-
-static void walkRegionBackward(Region &region,
-                               llvm::function_ref<void(Operation *)> visit) {
-  // blocks: back -> front
-  for (Block &block : llvm::reverse(region)) {
-    // ops: back -> front, early-inc so visit() may erase current op safely
-    for (Operation &op : llvm::reverse(block)) {
-      // make sure we first visit inside the region op (so yield op first)
-      // and then move to region op itself
-      for (Region &nested : llvm::reverse(op.getRegions()))
-        walkRegionBackward(nested, visit);
-
-      visit(&op);
-    }
-  }
-}
-
-static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
-  xegpu::DistributeLayoutAttr layout = nullptr;
-  for (OpOperand &use : result.getUses()) {
-    if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
-      // debug print the use and op, and the tmpLayout
-      LLVM_DEBUG({
-        DBGS() << "getLayoutFromUsePoints  use: " << use.getOwner()->getName()
-               << use.getOwner();
-        llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
-      });
-      // under debug mode, we want to check all the use points to make sure
-      // there is no conflict, so we do not break here. In release mode, we can
-      // break at the first use
-      if (!layout)
-        layout = tmpLayout;
-    }
-  }
-  return layout;
-}
-
-// For regular operations: First the result layouts are propagated from uses.
-// Then the result layouts are propagated to uses (operands).
-static void propagateResultsToRegularOperands(Operation *op) {
-  LLVM_DEBUG(DBGS() << "propagateResultsToRegularOperands: " << op->getName()
-                    << " (" << op->getNumOperands() << " operands, "
-                    << op->getNumResults() << " results)\n");
-
-  if (op->getNumResults() == 0) {
-    LLVM_DEBUG(DBGS() << "  skipping (no results)\n");
-    return;
-  }
-
-  Value result = op->getResult(0);
-  xegpu::DistributeLayoutAttr resLayout =
-      getLayoutFromUsePoints(op->getResult(0));
-  Type resultType = result.getType();
-
-  // recover layout for tensor Descriptor type, which is a special case since
-  // its layout is not stored as an attribute but encoded in the type itself.
-  // For vector type, we attach the layout as an attribute to op.
-  if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
-    auto layout = tensorDescTy.getLayoutAttr();
-    // TODO: remove the layout check. The tensorDescType's layout is treated as
-    // temporary layout, which needs to be set by layout recovery.
-    // allow it now to pass some legacy test case
-    if (!layout) {
-      auto typeWithLayout = xegpu::TensorDescType::get(
-          tensorDescTy.getContext(), tensorDescTy.getShape(),
-          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
-      result.setType(typeWithLayout);
-    }
-  }
-
-  for (OpOperand &opr : op->getOpOperands()) {
-    // Layouts are needed for vector type only.
-    xegpu::DistributeLayoutAttr operandLayout =
-        xegpu::inferSourceLayoutFromResult(opr, resLayout);
-    if (!isa<VectorType>(opr.get().getType())) {
-      LLVM_DEBUG(DBGS() << "  operand #" << opr.getOperandNumber()
-                        << ": skipped (non-vector type: " << opr.get().getType()
-                        << ")\n");
-      continue;
-    }
-
-    xegpu::setTemporaryLayout(opr, operandLayout);
-    // debug print op
-    LLVM_DEBUG(DBGS() << "after propagateResultsToRegularOperands  op: "
-                      << op->getName() << op << "  operand #"
-                      << opr.getOperandNumber()
-                      << ": type=" << opr.get().getType());
-    llvm::dbgs() << ", temp Layout=" << xegpu::getTemporaryLayout(opr);
-    llvm::dbgs() << "\n";
-  }
-}
-
-static void propagateRegionResultsToYieldOperands(
-    mlir::RegionBranchTerminatorOpInterface yieldOp) {
-  LLVM_DEBUG(DBGS() << "propagateRegionResultsToYieldOperands: "
-                    << yieldOp->getName() << " (" << yieldOp->getNumOperands()
-                    << " operands), parent="
-                    << yieldOp->getParentOp()->getName() << "\n");
-
-  if (isa<func::FuncOp>(yieldOp->getParentOp())) {
-    LLVM_DEBUG(DBGS() << "  skipping (parent is FuncOp)\n");
-    return;
-  }
-
-  auto regionBranchOp =
-      dyn_cast<RegionBranchOpInterface>(yieldOp->getParentOp());
-  if (!regionBranchOp) {
-    LLVM_DEBUG(DBGS() << "  skipping (parent is not RegionBranchOp)\n");
-    return;
-  }
-
-  // Gather layouts for each result of the parent region op from external
-  // use points.
-  unsigned numResults = regionBranchOp->getNumResults();
-  LLVM_DEBUG(DBGS() << "  parent op has " << numResults << " results\n");
-  if (numResults == 0)
-    return;
-
-  SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
-  for (unsigned i = 0; i < numResults; ++i) {
-    OpResult result = regionBranchOp->getResult(i);
-    resultLayouts[i] = getLayoutFromUsePoints(result);
-    if (resultLayouts[i]) {
-      LLVM_DEBUG(DBGS() << "  result #" << i << ": type=" << result.getType()
-                        << ", layout=" << resultLayouts[i] << "\n");
-      xegpu::setTemporaryLayout(result, resultLayouts[i]);
-    } else {
-      LLVM_DEBUG(DBGS() << "  result #" << i
-                        << ": skipped (no layout from use points)\n");
-    }
-  }
-
-  // Use getSuccessorOperands to find which operands of the terminator
-  // flow to a successor. This handles index offsets automatically (e.g.,
-  // scf.condition's predicate at operand #0 is excluded).
-  // Pick the first successor to determine the operand range.
-  SmallVector<RegionSuccessor> successors;
-  SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
-  yieldOp.getSuccessorRegions(operandAttrs, successors);
-  assert(!successors.empty() && "terminator must have at least one successor");
-
-  OperandRange succOps = yieldOp.getSuccessorOperands(successors.front());
-  unsigned beginIdx = succOps.getBeginOperandIndex();
-  unsigned count = std::min(static_cast<unsigned>(succOps.size()), numResults);
-
-  LLVM_DEBUG(DBGS() << "  " << count << " successor operands starting at index "
-                    << beginIdx << "\n");
-
-  for (unsigned i = 0; i < count; ++i) {
-    if (!resultLayouts[i])
-      continue;
-    LLVM_DEBUG(DBGS() << "    -> setting layout on operand #" << (beginIdx + i)
-                      << "\n");
-    xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i),
-                              resultLayouts[i]);
-  }
-
-  LLVM_DEBUG({
-    DBGS() << " after propagateRegionResultsToYieldOperands:\n";
-    yieldOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-    llvm::dbgs() << "\n";
-  });
-}
-
-static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
-  LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
-                    << " (" << regionOp->getNumOperands() << " operands, "
-                    << regionOp->getNumRegions() << " regions)\n");
-  LLVM_DEBUG({
-    DBGS() << " before propagateRegionArgsToInits, Region IR:\n";
-    regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-    llvm::dbgs() << "\n";
-  });
-
-  // Iterate all regions of the region op. For each block argument that has a
-  // layout (determined from its use points), trace back to find the
-  // corresponding init operand of the regionOp and set the layout on it.
-  // This works generically for scf.for, scf.while, and other
-  // RegionBranchOpInterface ops.
-  for (Region &region : regionOp->getRegions()) {
-    RegionSuccessor regionSuccessor(&region);
-    for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
-      auto layout = getLayoutFromUsePoints(regionArg);
+// Attach layout attributes to all vector-type operands of operations within
+// the given operation's region. Emits a warning (and skips the operand) if a
+// vector operand lacks a layout attribute.
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+  auto result = rootOp->walk([&](Operation *op) {
+    for (OpOperand &operand : op->getOpOperands()) {
+      // Layouts are needed for vector type only.
+      if (!isa<VectorType>(operand.get().getType()))
+        continue;
+      // Skip block arguments since they don't have defining ops to attach
+      // layout attributes to.
+      if (isa<BlockArgument>(operand.get()))
+        continue;
+      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
       if (!layout) {
-        LLVM_DEBUG(DBGS() << "  region #" << region.getRegionNumber()
-                          << " arg #" << argIdx << ": skipped (no layout)\n");
+        op->emitWarning("Could not find layout attribute for operand ")
+            << operand.getOperandNumber() << " of operation " << op->getName();
         continue;
       }
-      LLVM_DEBUG(DBGS() << "  region #" << region.getRegionNumber() << " arg #"
-                        << argIdx << ": type=" << regionArg.getType()
-                        << ", layout=" << layout << "\n");
-
-      // Find all predecessor values that flow into this block argument.
-      SmallVector<Value> predValues;
-      regionOp.getPredecessorValues(regionSuccessor, argIdx - 1, predValues);
-      for (Value predVal : predValues) {
-        // Match predecessor value to an operand of the regionOp.
-        for (OpOperand &operand : regionOp->getOpOperands()) {
-          if (operand.get() == predVal) {
-            LLVM_DEBUG(DBGS() << "    -> setting layout on init operand #"
-                              << operand.getOperandNumber() << "\n");
-            xegpu::setTemporaryLayout(operand, layout);
-          }
-        }
-      }
+      xegpu::setTemporaryLayout(operand, layout);
     }
-  }
-
-  LLVM_DEBUG({
-    DBGS() << " after propagateRegionArgsToInits, Region IR:\n";
-    regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-    llvm::dbgs() << "\n";
-  });
-}
-
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
-  LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts START ===\n");
-
-  auto processFunc = [&](Region &body, StringRef funcName) {
-    LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
-    walkRegionBackward(body, [&](Operation *op) {
-      LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
-      if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
-        // hit the region op after visiting inside region
-        LLVM_DEBUG(DBGS() << "  -> dispatching as RegionBranchOp\n");
-        propagateRegionArgsToInits(regionOp);
-      } else if (auto yieldOp =
-                     dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
-        // yield op inside region op
-        LLVM_DEBUG(DBGS() << "  -> dispatching as YieldOp\n");
-        propagateRegionResultsToYieldOperands(yieldOp);
-      } else if (!dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
-        // if the op is regular op, calling propagateResultsToRegularOperands
-        LLVM_DEBUG(DBGS() << "  -> dispatching as regular op\n");
-        propagateResultsToRegularOperands(op);
-      }
-    });
-  };
-
-  rootOp->walk([&](func::FuncOp func) {
-    processFunc(func.getBody(), func.getSymName());
-  });
-  rootOp->walk([&](gpu::GPUFuncOp func) {
-    processFunc(func.getBody(), func.getName());
-  });
-
-  LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
-  // print the root op after
-  LLVM_DEBUG({
-    DBGS() << "After recoverTemporaryLayouts, IR:\n";
-    rootOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
-    llvm::dbgs() << "\n";
+    return WalkResult::advance();
   });
-  return true;
+  return !result.wasInterrupted();
 }
 
-// // Attach layout attributes to all vector-type operands of operations within
-// // the given operation's region. Reports an error if any vector operand lacks
-// // a layout attribute.
-// bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
-//   auto result = rootOp->walk([&](Operation *op) {
-//     for (OpOperand &operand : op->getOpOperands()) {
-//       // Layouts are needed for vector type only.
-//       if (!isa<VectorType>(operand.get().getType()))
-//         continue;
-//       // Skip block arguments since they don't have defining ops to attach
-//       // layout attributes to.
-//       if (isa<BlockArgument>(operand.get()))
-//         continue;
-//       auto layout = xegpu::getDistributeLayoutAttr(operand.get());
-//       if (!layout) {
-//         op->emitWarning("Could not find layout attribute for operand ")
-//             << operand.getOperandNumber() << " of operation " <<
-//             op->getName();
-//         xegpu::setTemporaryLayout(operand, layout);
-//         continue;
-//       }
-//     }
-//     return WalkResult::advance();
-//   });
-//   return !result.wasInterrupted();
-// }
-
 template <typename T, typename>
 void xegpu::removeLayoutAttr(const T &operandOrResult) {
   Operation *owner = operandOrResult.getOwner();
@@ -1403,153 +1108,99 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
   return std::nullopt;
 }
 
-xegpu::DistributeLayoutAttr
-xegpu::inferSourceLayoutFromResult(OpOperand &operand,
-                                   xegpu::DistributeLayoutAttr resLayout) {
-  if (!resLayout) {
-    LLVM_DEBUG(DBGS() << "no resLayout, returning null\n");
-    return xegpu::DistributeLayoutAttr();
-  }
+xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
   Operation *op = operand.getOwner();
   unsigned idx = operand.getOperandNumber();
+  xegpu::DistributeLayoutAttr resLayout;
+  if (op->getNumResults() == 1)
+    resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
 
   // For vector::BroadcastOp, infer the source layout from the result layout.
   if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
-    LLVM_DEBUG(DBGS() << "  -> BroadcastOp\n");
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
     auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
-    if (!srcTy) {
-      LLVM_DEBUG(DBGS() << "     source is not VectorType, returning null\n");
+    if (!srcTy)
       return xegpu::DistributeLayoutAttr();
-    }
-    auto inferred = xegpu::inferBroadcastSourceLayout(
+    return xegpu::inferBroadcastSourceLayout(
         resLayout, broadcast.getResultVectorType().getShape(),
         srcTy.getShape());
-    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
-    return inferred;
   }
 
   // For vector::MultiDimReductionOp, infer source layout from result layout
   // using reduction dims. Acc operand is expected to have the same layout as
   // the result.
   if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
-    LLVM_DEBUG(DBGS() << "  -> MultiDimReductionOp, operand idx=" << idx
-                      << "\n");
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
     if (idx == 0) {
       SmallVector<int64_t> reductionDims(reduction.getReductionDims());
-      LLVM_DEBUG({
-        DBGS() << "     reductionDims=[";
-        llvm::interleaveComma(reductionDims, llvm::dbgs());
-        llvm::dbgs() << "]\n";
-      });
-      auto inferred =
-          xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
-      LLVM_DEBUG(DBGS() << "     inferred source layout=" << inferred << "\n");
-      return inferred;
+      return xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
     }
-    if (idx == 1) {
-      LLVM_DEBUG(DBGS() << "     acc operand, using resLayout\n");
+    if (idx == 1)
       return resLayout;
-    }
   }
 
   if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
-    LLVM_DEBUG(DBGS() << "  -> ReductionOp\n");
-    auto inferred = xegpu::inferReductionSourceLayout(resLayout);
-    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
-    return inferred;
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    return xegpu::inferReductionSourceLayout(resLayout);
   }
 
   // For vector::BitCastOp, infer source layout from result layout using
   // element type bitwidths.
   if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
-    LLVM_DEBUG(DBGS() << "  -> BitCastOp\n");
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
     int resElemBitWidth =
         bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
     int srcElemBitWidth =
         bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
-    LLVM_DEBUG(DBGS() << "     resBitWidth=" << resElemBitWidth
-                      << ", srcBitWidth=" << srcElemBitWidth << "\n");
-    auto inferred = xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
-                                                    srcElemBitWidth);
-    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
-    return inferred;
+    return xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
+                                           srcElemBitWidth);
   }
 
   // For vector::ShapeCastOp, infer source layout from result layout using
   // shapes.
   if (auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
-    LLVM_DEBUG({
-      DBGS() << "  -> ShapeCastOp: resShape=[";
-      llvm::interleaveComma(shapeCast.getResultVectorType().getShape(),
-                            llvm::dbgs());
-      llvm::dbgs() << "], srcShape=[";
-      llvm::interleaveComma(shapeCast.getSourceVectorType().getShape(),
-                            llvm::dbgs());
-      llvm::dbgs() << "]\n";
-    });
-    auto inferred = xegpu::inferShapeCastSourceLayout(
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    return xegpu::inferShapeCastSourceLayout(
         resLayout, shapeCast.getResultVectorType().getShape(),
         shapeCast.getSourceVectorType().getShape());
-    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
-    return inferred;
   }
 
   // For vector::InsertStridedSliceOp, infer source layout from result layout.
   // Dest vector must have the same layout as the result.
   if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
-    LLVM_DEBUG(DBGS() << "  -> InsertStridedSliceOp, operand idx=" << idx
-                      << "\n");
-    if (idx == 0) {
-      auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    if (idx == 0)
+      return xegpu::inferInsertStridedSliceSourceLayout(
           resLayout, insertSlice.getDestVectorType().getShape(),
           insertSlice.getSourceVectorType().getShape());
-      LLVM_DEBUG(DBGS() << "     inferred source layout=" << inferred << "\n");
-      return inferred;
-    }
-    if (idx == 1) {
-      LLVM_DEBUG(DBGS() << "     dest operand, using resLayout\n");
+    if (idx == 1)
       return resLayout;
-    }
   }
 
   // For vector::TransposeOp, infer source layout from result layout using
   // permutation.
   if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
-    LLVM_DEBUG({
-      DBGS() << "  -> TransposeOp, perm=[";
-      llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
-      llvm::dbgs() << "]\n";
-    });
-    auto inferred = xegpu::inferTransposeSourceLayout(
-        resLayout, transpose.getPermutation());
-    LLVM_DEBUG(DBGS() << "     inferred=" << inferred << "\n");
-    return inferred;
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    return xegpu::inferTransposeSourceLayout(resLayout,
+                                             transpose.getPermutation());
   }
 
   // For elementwise operations, all operands must have the same layout as the
   // result.
   if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
-    LLVM_DEBUG(DBGS() << "  -> elementwise op, using resLayout="
-                      << (resLayout ? resLayout : nullptr) << "\n");
-
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
     return resLayout;
   }
-  return xegpu::DistributeLayoutAttr();
-}
-
-xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
-  Operation *op = operand.getOwner();
-  xegpu::DistributeLayoutAttr resLayout;
-  if (op->getNumResults() == 1)
-    resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
-  auto inferredOperandLayout = inferSourceLayoutFromResult(operand, resLayout);
-  if (inferredOperandLayout)
-    return inferredOperandLayout;
+  // TODO: Handle more cases as needed here.
   // By default, assume no layout conflict and return the current layout of
   // the operand.
-  auto fallback = xegpu::getDistributeLayoutAttr(operand.get());
-  LLVM_DEBUG(DBGS() << "  -> fallback (unhandled op " << op->getName()
-                    << "), returning operand layout="
-                    << (fallback ? fallback : nullptr) << "\n");
-  return fallback;
+  return xegpu::getDistributeLayoutAttr(operand.get());
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index f0ff771f4cbc4..4c30dacae8850 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1338,7 +1338,7 @@ LogicalResult ResolveLayoutConflicts::run() {
     // as anchor op for the reduction op's layout.
     if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
       for (OpResult result : op->getResults()) {
-        if (result.getType().isIntOrFloat() || result.use_empty()) {
+        if (result.getType().isIntOrFloat()) {
           auto res = assignResultLayout(result);
           if (failed(res)) {
             DBGS() << "Failed to resolve vector consumer for multi-reduction "
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 0d1bfd5480aa2..842c2375dd31d 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,6 +473,22 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_transpose
+// CHECK:         %[[SRC:.*]] = "some_op"()
+// CHECK:         %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
+// CHECK-NEXT:    %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+// CHECK-NEXT:    gpu.return
+gpu.func @vector_transpose() {
+  %cst = "some_op"()
+    {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+    : () -> (vector<16x2xf32>)
+  %transpose = vector.transpose %cst, [1, 0]
+    {
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    }
+    : vector<16x2xf32> to vector<2x16xf32>
+  gpu.return
+}
 
 // CHECK-LABEL: gpu.func @vector_bitcast
 // CHECK:         %[[SRC:.*]] = "some_op"()
@@ -1076,8 +1092,7 @@ gpu.module @xevm_module {
 gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
   %0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
   %1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
-  %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
-  "some_use"(%2) : (vector<16x16xf16>) -> ()
+  "some_use"(%1) : (vector<16x16xf16>) -> ()
   gpu.return
 }
 }

>From 1328c5ff1981598a4ed9ff102f1ac17360cbd6c4 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:37:55 +0000
Subject: [PATCH 6/7] cleanup

---
 .../Transforms/XeGPUPeepHoleOptimizer.cpp     | 15 +---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 38 +---------
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   | 74 +++----------------
 3 files changed, 12 insertions(+), 115 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index c43eaba5b3ee6..c488bca363da6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -28,7 +28,6 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Debug.h"
 #include <optional>
 
 namespace mlir {
@@ -151,22 +150,12 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
   auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
   SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
                                      laneLayoutI64.end());
-  LLVM_DEBUG({
-    DBGS() << "tryOptimize: origLayout=" << origLayout << "\n";
-    DBGS() << "  laneLayoutI32=[";
-    llvm::interleaveComma(laneLayoutI32, llvm::dbgs());
-    llvm::dbgs() << "], laneData=[1, 1]";
-    if (origLayout.getOrder())
-      llvm::dbgs() << ", order=" << origLayout.getOrder();
-    llvm::dbgs() << "\n";
-    DBGS() << "  supportedShape=[" << supportedHeight << ", " << supportedWidth
-           << "], newElemTy=" << newElemTy << ", arrayLen=" << arrayLen << "\n";
-  });
+
   xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
       ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
       /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
       /*order=*/origLayout.getOrder());
-  LLVM_DEBUG(DBGS() << "  newLayout=" << newLayout << "\n");
+
   // Array length can not be larger than 1 for transpose case.
   return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
                                     tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 27cf788933f18..d8ce24ddd5cb0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -800,17 +800,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
-    LLVM_DEBUG(DBGS() << "StoreDistribution: attempting to match\n");
     Operation *lastNode = warpOp.getTerminator()->getPrevNode();
     auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
-    if (!storeScatterOp) {
-      LLVM_DEBUG(
-          DBGS()
-          << "StoreDistribution: last node is not StoreScatterOp, skipping\n");
+    if (!storeScatterOp)
       return failure();
-    }
-    LLVM_DEBUG(DBGS() << "StoreDistribution: matched StoreScatterOp: "
-                      << *storeScatterOp << "\n");
     auto offsets = storeScatterOp.getOffsets();
     if (!offsets || !isa<VectorType>(offsets.getType()))
       return rewriter.notifyMatchFailure(
@@ -818,15 +811,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
     VectorType offsetsTy = cast<VectorType>(offsets.getType());
     VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
     VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
-    LLVM_DEBUG(DBGS() << "StoreDistribution: offsetsTy=" << offsetsTy
-                      << ", maskTy=" << maskTy << ", storeVecTy=" << storeVecTy
-                      << "\n");
 
     // Add handling for leading unit dimensions support
     int chunkSize = storeScatterOp.getChunkSize().value_or(1);
     int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
-    LLVM_DEBUG(DBGS() << "StoreDistribution: chunkSize=" << chunkSize
-                      << ", effectiveVecRank=" << effectiveVecRank << "\n");
 
     // Check that all leading dimensions are unit dimensions
     for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
@@ -843,24 +831,6 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
         xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
     auto layoutMask =
         xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
-    LLVM_DEBUG({
-      DBGS() << "StoreDistribution: layoutPayload=";
-      if (layoutPayload)
-        DBGS() << layoutPayload;
-      else
-        DBGS() << "(null)";
-      DBGS() << ", layoutOffsets=";
-      if (layoutOffsets)
-        DBGS() << layoutOffsets;
-      else
-        DBGS() << "(null)";
-      DBGS() << ", layoutMask=";
-      if (layoutMask)
-        DBGS() << layoutMask;
-      else
-        DBGS() << "(null)";
-      DBGS() << "\n";
-    });
 
     FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -879,9 +849,6 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
     VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
     VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
     VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-    LLVM_DEBUG(DBGS() << "StoreDistribution: distPayloadTy=" << distPayloadTy
-                      << ", distOffsetsTy=" << distOffsetsTy
-                      << ", distMaskTy=" << distMaskTy << "\n");
 
     SmallVector<size_t> newRetIndices;
     SmallVector<Value> operands = storeScatterOp->getOperands();
@@ -918,10 +885,7 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
         rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
         storeScatterOp->getAttrs());
     xegpu::removeLayoutAttrs(newOp);
-    LLVM_DEBUG(DBGS() << "StoreDistribution: created new op: " << newOp
-                      << "\n");
     rewriter.eraseOp(storeScatterOp);
-    LLVM_DEBUG(DBGS() << "StoreDistribution: done\n");
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 55cf47e38dfd0..bcac517937754 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -23,14 +23,10 @@
 #include "mlir/Interfaces/LoopLikeInterface.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
 #include <numeric>
 
-#define DEBUG_TYPE "xegpu-utils"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
 using namespace mlir;
 
 /// convert ArrayRef<ValueRange> into SmallVector<Value>
@@ -149,31 +145,19 @@ std::string xegpu::getTemporaryLayoutName(const OpResult result) {
 }
 
 xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
-  LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(Value): type="
-                    << value.getType() << "\n");
-  if (!value) {
-    LLVM_DEBUG(DBGS() << "  -> null value, returning nullptr\n");
+  if (!value)
     return nullptr;
-  }
 
   if (auto tdescTy =
-          dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
-    auto layout = tdescTy.getLayoutAttr();
-    LLVM_DEBUG(DBGS() << "  -> TensorDescType, layout="
-                      << (layout ? layout : nullptr) << "\n");
-    return layout;
-  }
+          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
+    return tdescTy.getLayoutAttr();
 
   if (auto result = dyn_cast<OpResult>(value)) {
     Operation *defOp = result.getDefiningOp();
     assert(defOp && "result must have a defining op");
-    LLVM_DEBUG(DBGS() << "  OpResult #" << result.getResultNumber() << " from "
-                      << defOp->getName() << "\n");
 
     if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
       auto layout = anchorOp.getAnchorLayout();
-      LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface, layout="
-                        << (layout ? layout : nullptr) << "\n");
       return layout;
     }
 
@@ -181,100 +165,60 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     if (defOp->hasAttr(layoutName)) {
       auto layout =
           defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
-      LLVM_DEBUG(DBGS() << "  -> temporary attr '" << layoutName
-                        << "', layout=" << layout << "\n");
       return layout;
     }
-    LLVM_DEBUG(DBGS() << "  -> OpResult: no layout found (checked '"
-                      << layoutName << "')\n");
   }
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
     auto *parentOp = arg.getOwner()->getParentOp();
-    LLVM_DEBUG(DBGS() << "  BlockArgument #" << arg.getArgNumber() << " of "
-                      << (parentOp ? parentOp->getName().getStringRef()
-                                   : StringRef("(null)"))
-                      << "\n");
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
       if (tiedInit) {
-        LLVM_DEBUG(DBGS() << "  -> LoopLikeOp, recursing into tiedInit "
-                          << "operand #" << tiedInit->getOperandNumber()
-                          << "\n");
         return getDistributeLayoutAttr(tiedInit->get());
       }
-      LLVM_DEBUG(DBGS() << "  -> LoopLikeOp, no tiedInit\n");
     }
   }
 
-  LLVM_DEBUG(DBGS() << "  -> returning nullptr\n");
   return nullptr;
 }
 xegpu::DistributeLayoutAttr
 xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   Operation *op = opr.getOwner();
   unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
-  LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(OpOperand): operand #" << idx
-                    << " of " << op->getName()
-                    << ", type=" << opr.get().getType() << "\n");
 
   if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
     if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
       if (idx == 0) {
-        auto layout = dpasOp.getLayoutAAttr();
-        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutA="
-                          << (layout ? layout : nullptr) << "\n");
-        return layout;
+        return dpasOp.getLayoutAAttr();
       } else if (idx == 1) {
-        auto layout = dpasOp.getLayoutBAttr();
-        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutB="
-                          << (layout ? layout : nullptr) << "\n");
-        return layout;
+        return dpasOp.getLayoutBAttr();
       } else if (idx == 2) {
-        auto layout = dpasOp.getLayoutCdAttr();
-        LLVM_DEBUG(DBGS() << "  -> DpasOp layoutCd="
-                          << (layout ? layout : nullptr) << "\n");
-        return layout;
+        return dpasOp.getLayoutCdAttr();
       }
     }
     if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
-      auto layout = convertOp.getInputLayoutAttr();
-      LLVM_DEBUG(DBGS() << "  -> ConvertLayoutOp inputLayout="
-                        << (layout ? layout : nullptr) << "\n");
-      return layout;
+      return convertOp.getInputLayoutAttr();
     }
     auto layout = anchorOp.getAnchorLayout();
 
-    if (idx == 0) {
-      LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface idx=0, layout="
-                        << (layout ? layout : nullptr) << "\n");
+    if (idx == 0)
       return layout;
-    }
 
     // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
     // the layout is valid for the first two operands: value and memref/tdesc.
     // For other operations, the layout applies to the first operand only.
     if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
             op) &&
-        (idx < 2)) {
-      LLVM_DEBUG(DBGS() << "  -> Store op idx=" << idx
-                        << ", layout=" << (layout ? layout : nullptr) << "\n");
+        (idx < 2))
       return layout;
-    }
-    LLVM_DEBUG(DBGS() << "  -> AnchorLayoutInterface idx=" << idx
-                      << " not covered, falling through\n");
   }
 
   std::string layoutName = xegpu::getTemporaryLayoutName(opr);
   if (op->hasAttr(layoutName)) {
     auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
-    LLVM_DEBUG(DBGS() << "  -> temporary attr '" << layoutName
-                      << "', layout=" << layout << "\n");
     return layout;
   }
 
-  LLVM_DEBUG(DBGS() << "  -> returning nullptr (checked '" << layoutName
-                    << "')\n");
   return nullptr;
 }
 

>From 2617a0258e5435f141292f85c5633a8574bdaebd Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:41:17 +0000
Subject: [PATCH 7/7] cleanup

---
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index bcac517937754..f0508a30621f2 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -173,9 +173,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     auto *parentOp = arg.getOwner()->getParentOp();
     if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
       OpOperand *tiedInit = loop.getTiedLoopInit(arg);
-      if (tiedInit) {
+      if (tiedInit)
         return getDistributeLayoutAttr(tiedInit->get());
-      }
     }
   }