[Mlir-commits] [mlir] [MLIR][XeGPU] Recover temporary layout from Anchor Layout (PR #191947)
Jianhui Li
llvmlistbot at llvm.org
Mon Apr 20 20:02:54 PDT 2026
================
@@ -80,30 +82,199 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
return out;
}
-// Attach layout attributes to all vector-type operands of operations within
-// the given operation's region. Reports an error if any vector operand lacks
-// a layout attribute.
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
- auto result = rootOp->walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- if (!isa<VectorType>(operand.get().getType()))
- continue;
- // Skip block arguments since they don't have defining ops to attach
- // layout attributes to.
- if (isa<BlockArgument>(operand.get()))
- continue;
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
- if (!layout) {
- op->emitWarning("Could not find layout attribute for operand ")
- << operand.getOperandNumber() << " of operation " << op->getName();
+// Prerequisites for layout recovery.
+// Layout recovery relies on the following invariants:
+// 1. There is no layout conflict between different uses of the same definition.
+// 2. Each definition has a well-defined layout requirement at its use point.
+// - Every definition must have at least one use that appears after it in
+// topological order.
+// - If a definition has no such use (e.g., a loop result or region output),
+// an explicit convert_layout operation is inserted to create a use.
+// - Only the result of convert_layout is permitted to have no subsequent
+// use.
+
+// The recovery proceeds by scanning the operation in reverse topological order
+// as follows:
+// For regular operations: First the result layouts are propagated from uses.
+// Then the result layouts are propagated to operands.
+//
+// For region operations (e.g., loops):
+// - When backward propagation reaches a region op, it sets the layout of
+// the region op’s results according to use points like regular ops.
+// - Then, the result layouts (such as a loop output) are propagated to
+// their corresponding operands in the yield.
+// - When backward propagation reaches the first operation inside the
+// region, the pass examines the region op’s initialization list,
+// propagating from region arguments to the corresponding initialization
+// operands.
+// - This ensures that layouts are consistently propagated
+// across region boundaries while preserving a single well-defined use for
+// each definition at the region-op level.
+
+// The inner function for recoverTemporaryLayouts is a recursive function.
+// The input rootOp is the function operation, which is also a region op.
+// It recursively processes the region op in reverse topological order.
+
+/// Visits every operation nested under `region` back to front: blocks from
+/// last to first, and the operations of each block from last to first.
+///
+/// The regions attached to an operation are walked (also last to first) before
+/// `visit` is invoked on that operation, so the contents of a region, starting
+/// with its terminator, are seen before the op that owns the region.
+///
+/// \param region the region whose operations are traversed.
+/// \param visit  callback invoked once per operation.
+static void walkRegionBackward(Region &region,
+                               llvm::function_ref<void(Operation *)> visit) {
+  // blocks: back -> front
+  for (Block &block : llvm::reverse(region)) {
+    // ops: back -> front. The early-inc range advances the iterator before
+    // the loop body runs, so visit() may erase the current op safely.
+    for (Operation &op : llvm::make_early_inc_range(llvm::reverse(block))) {
+      // Walk the nested regions first (so the yield op is visited first),
+      // then move on to the region op itself.
+      for (Region &nested : llvm::reverse(op.getRegions()))
+        walkRegionBackward(nested, visit);
+
+      visit(&op);
+    }
+  }
+}
+
+/// Returns the layout found on the first use of `result` that carries one, or
+/// a null attribute when no use has a layout. Every use is inspected; layouts
+/// found on later uses are ignored.
+static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
+  xegpu::DistributeLayoutAttr firstFound = nullptr;
+  for (OpOperand &use : result.getUses()) {
+    auto candidate = xegpu::getDistributeLayoutAttr(use);
+    // Skip uses without a layout, and keep the first layout once it is set.
+    if (!candidate || firstFound)
+      continue;
+    firstFound = candidate;
+  }
+  return firstFound;
+}
+
+/// Recovers layouts for a regular operation.
+///
+/// The layout of the op's first result is taken from its use points and
+/// recorded on that result. Each vector-typed operand then receives the layout
+/// inferred from the result layout. Ops without results are left untouched,
+/// and only result #0 is considered.
+static void propagateResultsToRegularOperands(Operation *op) {
+  if (op->getNumResults() == 0)
+    return;
+
+  OpResult result = op->getResult(0);
+  // NOTE(review): resLayout is null when no use of the result carries a
+  // layout; it is still forwarded to the calls below. Confirm they accept a
+  // null layout.
+  xegpu::DistributeLayoutAttr resLayout = getLayoutFromUsePoints(result);
+
+  // Recover the layout for the tensor descriptor type, which is a special case
+  // since its layout is not stored as an attribute but encoded in the type
+  // itself. For vector types, the layout is attached as an attribute to the op.
+  if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(result.getType())) {
+    // TODO: remove the layout check. The tensorDescType's layout is treated as
+    // a temporary layout, which needs to be set by layout recovery.
+    // It is allowed for now so that some legacy test cases pass.
+    if (!tensorDescTy.getLayoutAttr()) {
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+      result.setType(typeWithLayout);
+    }
+  }
+
+  xegpu::setTemporaryLayout(result, resLayout);
+
+  for (OpOperand &opr : op->getOpOperands()) {
+    // Layouts are needed for vector types only, so filter before running the
+    // inference instead of computing a layout that would be discarded.
+    if (!isa<VectorType>(opr.get().getType()))
+      continue;
+
+    xegpu::DistributeLayoutAttr operandLayout =
+        xegpu::inferSourceLayoutFromResult(opr, resLayout);
+    xegpu::setTemporaryLayout(opr, operandLayout);
+  }
+}
+
+/// Pushes the layouts of a region-branch op's results onto the operands of the
+/// terminator `yieldOp` inside it.
+///
+/// Nothing happens when the terminator's parent is a func::FuncOp, is not a
+/// RegionBranchOpInterface, or has no results. Otherwise each parent result
+/// gets the layout found at its use points (when there is one), and the same
+/// layout is set on the terminator operand at the matching position.
+static void propagateRegionResultsToYieldOperands(
+    mlir::RegionBranchTerminatorOpInterface yieldOp) {
+  Operation *parentOp = yieldOp->getParentOp();
+  if (isa<func::FuncOp>(parentOp))
+    return;
+
+  auto regionBranchOp = dyn_cast<RegionBranchOpInterface>(parentOp);
+  if (!regionBranchOp)
+    return;
+
+  const unsigned numResults = regionBranchOp->getNumResults();
+  if (numResults == 0)
+    return;
+
+  // Collect one layout per parent result from its external use points, and
+  // record it on the result when present. Missing layouts stay null.
+  SmallVector<xegpu::DistributeLayoutAttr> resultLayouts;
+  resultLayouts.reserve(numResults);
+  for (OpResult result : regionBranchOp->getResults()) {
+    xegpu::DistributeLayoutAttr layout = getLayoutFromUsePoints(result);
+    if (layout)
+      xegpu::setTemporaryLayout(result, layout);
+    resultLayouts.push_back(layout);
+  }
+
+  // Ask the terminator which of its operands flow to a successor. This takes
+  // care of index offsets (e.g., scf.condition's predicate at operand #0 is
+  // excluded). The first successor determines the operand range.
+  SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
+  SmallVector<RegionSuccessor> successors;
+  yieldOp.getSuccessorRegions(operandAttrs, successors);
+  assert(!successors.empty() && "terminator must have at least one successor");
+
+  OperandRange forwarded = yieldOp.getSuccessorOperands(successors.front());
+  const unsigned firstOperand = forwarded.getBeginOperandIndex();
+  const unsigned count =
+      std::min(static_cast<unsigned>(forwarded.size()), numResults);
+
+  // Mirror each known result layout onto the corresponding forwarded operand.
+  for (unsigned idx = 0; idx < count; ++idx) {
+    if (xegpu::DistributeLayoutAttr layout = resultLayouts[idx])
+      xegpu::setTemporaryLayout(yieldOp->getOpOperand(firstOperand + idx),
+                                layout);
+  }
+}
+
+static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
+ // Iterate all regions of the region op. For each block argument that has a
+ // layout (determined from its use points), trace back to find the
+ // corresponding init operand of the regionOp and set the layout on it.
+ // This works generically for scf.for, scf.while, and other
+ // RegionBranchOpInterface ops.
+ for (Region &region : regionOp->getRegions()) {
+ RegionSuccessor regionSuccessor(&region);
+ for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
+ auto layout = getLayoutFromUsePoints(regionArg);
+ if (!layout)
continue;
+
+ // Find all predecessor values that flow into this block argument.
+ SmallVector<Value> predValues;
----------------
Jianhui-Li wrote:
It contains both init_args and iter_args, but only the init_args are processed and have their operand layouts set.
https://github.com/llvm/llvm-project/pull/191947
More information about the Mlir-commits
mailing list