[Mlir-commits] [mlir] [MLIR][XeGPU] Recover temporary layout from Anchor Layout (PR #191947)

Mon Apr 20 20:02:54 PDT 2026

================
@@ -80,30 +82,199 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
   return out;
 }
 
-// Attach layout attributes to all vector-type operands of operations within
-// the given operation's region. Reports an error if any vector operand lacks
-// a layout attribute.
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
-  auto result = rootOp->walk([&](Operation *op) {
-    for (OpOperand &operand : op->getOpOperands()) {
-      // Layouts are needed for vector type only.
-      if (!isa<VectorType>(operand.get().getType()))
-        continue;
-      // Skip block arguments since they don't have defining ops to attach
-      // layout attributes to.
-      if (isa<BlockArgument>(operand.get()))
-        continue;
-      auto layout = xegpu::getDistributeLayoutAttr(operand.get());
-      if (!layout) {
-        op->emitWarning("Could not find layout attribute for operand ")
-            << operand.getOperandNumber() << " of operation " << op->getName();
+// Prerequisites for layout recovery.
+// Layout recovery relies on the following invariants:
+// 1. There is no layout conflict between different uses of the same definition.
+// 2. Each definition has a well-defined layout requirement at its use point.
+//     - Every definition must have at least one use that appears after it in
+//     topological order.
+//     - If a definition has no such use (e.g., a loop result or region output),
+//     an explicit convert_layout operation is inserted to create a use.
+//     - Only the result of convert_layout is permitted to have no subsequent
+//     use.
+
+// The recovery proceeds by scanning the operation in reverse topological order
+// as follows:
+//    For regular operations: First the result layouts are propagated from uses.
+//      Then the result layouts are propagated to operands.
+//
+//    For region operations (e.g., loops):
+//       - When backward propagation reaches a region op, it sets the layout of
+//       the region op’s results according to use points like regular ops.
+//       - Then, the result layouts (such as a loop output) are propagated to
+//       their corresponding operands in the yield.
+//       - When backward propagation reaches the first operation inside the
+//       region, the pass examines the region op’s initialization list,
+//       propagating from region arguments to the corresponding initialization
+//       operands.
+//       - This ensures that layouts are consistently propagated
+//       across region boundaries while preserving a single well-defined use for
+//       each definition at the region-op level.
+
+// The inner function for recoverTemporaryLayouts is a recursive function.
+// The input rootOp is the function operation, which is also a region op.
+// It recursively processes the region op in reverse topological order.
+
+/// Visits every operation nested under `region` in reverse order: blocks from
+/// last to first and, within each block, operations from last to first.
+/// Regions nested in an operation are walked (also last to first) before
+/// `visit` is invoked on that operation, so a region terminator is visited
+/// before its enclosing region op.
+///
+/// \param region the region whose operations are traversed.
+/// \param visit  callback invoked once per operation.
+static void walkRegionBackward(Region &region,
+                               llvm::function_ref<void(Operation *)> visit) {
+  // Blocks: back -> front.
+  for (Block &block : llvm::reverse(region)) {
+    // Ops: back -> front. The early-inc range advances the iterator before the
+    // loop body runs, so visit() may erase the current op safely.
+    for (Operation &op : llvm::make_early_inc_range(llvm::reverse(block))) {
+      // Walk the nested regions first (so the yield op is visited first), then
+      // move on to the region op itself.
+      for (Region &nested : llvm::reverse(op.getRegions()))
+        walkRegionBackward(nested, visit);
+
+      visit(&op);
+    }
+  }
+}
+
+/// Returns the layout found on the first use of `result` that carries one, or
+/// a null attribute when none of its uses has a layout.
+///
+/// Only the first layout found is ever reported, so the scan stops there
+/// instead of querying the remaining uses and discarding the answers.
+static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
+  for (OpOperand &use : result.getUses()) {
+    if (auto layout = xegpu::getDistributeLayoutAttr(use))
+      return layout;
+  }
+  return nullptr;
+}
+
+/// Recovers layouts for a regular (non-region) operation.
+///
+/// The layout of the operation's first result is taken from its use points
+/// and recorded on the result. That layout is then used to infer and record a
+/// layout for every vector-typed operand. Operations without results are left
+/// untouched.
+// NOTE(review): only result #0 is inspected; confirm that multi-result regular
+// ops do not need per-result handling.
+static void propagateResultsToRegularOperands(Operation *op) {
+  if (op->getNumResults() == 0)
+    return;
+
+  OpResult result = op->getResult(0);
+  xegpu::DistributeLayoutAttr resLayout = getLayoutFromUsePoints(result);
+  Type resultType = result.getType();
+
+  // recover layout for tensor Descriptor type, which is a special case since
+  // its layout is not stored as an attribute but encoded in the type itself.
+  // For vector type, we attach the layout as an attribute to op.
+  if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+    auto layout = tensorDescTy.getLayoutAttr();
+    // TODO: remove the layout check. The tensorDescType's layout is treated as
+    // temporary layout, which needs to be set by layout recovery.
+    // allow it now to pass some legacy test case
+    if (!layout) {
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+      result.setType(typeWithLayout);
+    }
+  }
+
+  xegpu::setTemporaryLayout(result, resLayout);
+
+  for (OpOperand &opr : op->getOpOperands()) {
+    // Layouts are needed for vector type only, so filter before running the
+    // inference rather than computing a layout that is then discarded.
+    if (!isa<VectorType>(opr.get().getType()))
+      continue;
+
+    xegpu::DistributeLayoutAttr operandLayout =
+        xegpu::inferSourceLayoutFromResult(opr, resLayout);
+    xegpu::setTemporaryLayout(opr, operandLayout);
+  }
+}
+
+/// Propagates the layouts of a region op's results onto the operands of one
+/// of its region terminators.
+///
+/// For each result of the parent RegionBranchOpInterface op, the layout is
+/// looked up from the result's use points and recorded on the result. The
+/// i-th operand that the terminator forwards to its first successor then
+/// receives the layout of the i-th result. Nothing is done when the parent is
+/// a func::FuncOp, is not a RegionBranchOpInterface op, or has no results.
+///
+/// \param yieldOp the region terminator whose forwarded operands are updated.
+static void propagateRegionResultsToYieldOperands(
+    mlir::RegionBranchTerminatorOpInterface yieldOp) {
+  Operation *parentOp = yieldOp->getParentOp();
+  if (isa<func::FuncOp>(parentOp))
+    return;
+
+  auto regionBranchOp = dyn_cast<RegionBranchOpInterface>(parentOp);
+  if (!regionBranchOp)
+    return;
+
+  // Gather layouts for each result of the parent region op from external
+  // use points.
+  unsigned numResults = regionBranchOp->getNumResults();
+  if (numResults == 0)
+    return;
+
+  SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
+  for (unsigned i = 0; i < numResults; ++i) {
+    OpResult result = regionBranchOp->getResult(i);
+    resultLayouts[i] = getLayoutFromUsePoints(result);
+    if (resultLayouts[i])
+      xegpu::setTemporaryLayout(result, resultLayouts[i]);
+  }
+
+  // Use getSuccessorOperands to find which operands of the terminator
+  // flow to a successor. This handles index offsets automatically (e.g.,
+  // scf.condition's predicate at operand #0 is excluded).
+  // Pick the first successor to determine the operand range.
+  SmallVector<RegionSuccessor> successors;
+  SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
+  yieldOp.getSuccessorRegions(operandAttrs, successors);
+  assert(!successors.empty() && "terminator must have at least one successor");
+
+  OperandRange succOps = yieldOp.getSuccessorOperands(successors.front());
+  // The terminator may forward no operands at all (e.g. a yield without
+  // operands); the begin index of an empty range must not be queried.
+  if (succOps.empty())
+    return;
+
+  unsigned beginIdx = succOps.getBeginOperandIndex();
+  unsigned count = std::min(static_cast<unsigned>(succOps.size()), numResults);
+
+  for (unsigned i = 0; i < count; ++i) {
+    if (!resultLayouts[i])
+      continue;
+    xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i),
+                              resultLayouts[i]);
+  }
+}
+
+static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
+  // Iterate all regions of the region op. For each block argument that has a
+  // layout (determined from its use points), trace back to find the
+  // corresponding init operand of the regionOp and set the layout on it.
+  // This works generically for scf.for, scf.while, and other
+  // RegionBranchOpInterface ops.
+  for (Region &region : regionOp->getRegions()) {
+    RegionSuccessor regionSuccessor(&region);
+    for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
+      auto layout = getLayoutFromUsePoints(regionArg);
+      if (!layout)
         continue;
+
+      // Find all predecessor values that flow into this block argument.
+      SmallVector<Value> predValues;
----------------
Jianhui-Li wrote:

It contains both init_args and iter_args, but only init_args are processed and have their operand layout set.

https://github.com/llvm/llvm-project/pull/191947