[Mlir-commits] [mlir] [MLIR][XeGPU] TensorDesc Type support generic DistributeLayout instead of Layout (PR #190401)
Jianhui Li
llvmlistbot at llvm.org
Fri Apr 3 13:43:24 PDT 2026
https://github.com/Jianhui-Li created https://github.com/llvm/llvm-project/pull/190401
This PR allows TensorDesc to support slice layout, not just plain layout.
>From 3a6c2fe41fa7953ca42e94e5663231b33052ce00 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 2 Apr 2026 22:42:50 +0000
Subject: [PATCH 1/7] initial implementation
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 5 +-
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 473 ++++++++++++++++--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 77 ++-
3 files changed, 499 insertions(+), 56 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 9cf9a8705209b..5f46eab7b74c7 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -183,10 +183,13 @@ setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
const uArch::uArch *uArch);
+DistributeLayoutAttr
+inferSourceLayoutFromResult(OpOperand &operand, DistributeLayoutAttr resLayout);
+
/// Gets the expected layout for a given consumer operand. This will check if
/// the owning operation of the consumer operand is one of the special layout
/// users and determine the expected layout accordingly.
-xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
+DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
} // namespace xegpu
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 55cd6ec04970c..06cd0eaa0059e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -18,16 +18,22 @@
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
+#define DEBUG_TYPE "xegpu-layout-recovery"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
using namespace mlir;
void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
@@ -80,32 +86,330 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
return out;
}
-// Attach layout attributes to all vector-type operands of operations within
-// the given operation's region. Reports an error if any vector operand lacks
-// a layout attribute.
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
- auto result = rootOp->walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- if (!isa<VectorType>(operand.get().getType()))
- continue;
- // Skip block arguments since they don't have defining ops to attach
- // layout attributes to.
- if (isa<BlockArgument>(operand.get()))
+// Prerequisite for Layout Recovery
+// It relies on the following invariants:
+// 1. there is no layout conflict between different uses of the same definition.
+// 2. each definition has a well-defined layout requirement at its use point.
+// - Every definition must have at least one use that appears after it in
+// topological order.
+// - If a definition has no such use (e.g., a loop result or region output),
+// an explicit convert_layout operation is inserted to create a use.
+// - Only the result of convert_layout is permitted to have no subsequent
+// use.
+
+// The recovery proceeds by scanning the operation in reverse topological order
+// as follows:
+// For regular operations: First the result layouts are propagated from uses.
+// Then the result layouts are propagated to operands.
+//
+// For region operations (e.g., loops):
+// - When backward propagation reaches a region op, it sets the layout of
+// the region op’s results according to use points like regular ops.
+// - Then, the result layouts (such as a loop output) are propagated to
+// their corresponding operands in the yield.
+// - When backward propagation reaches the first operation inside the
+// region, the pass examines the region op’s initialization list,
+// propagating from region arguments to the corresponding initialization
+// operands.
+// - This ensures that layouts are consistently propagated
+// across region boundaries while preserving a single well-defined use for
+// each definition at the region-op level.
+
+// The inner function for recoverTemporaryLayouts is a recursive function.
+// The input rootOp is the function operation, which is also a region op.
+// It recursively processes the region op in reverse topological order.
+
+static void walkRegionBackward(Region &region,
+ llvm::function_ref<void(Operation *)> visit) {
+ // blocks: back -> front
+ for (Block &block : llvm::reverse(region)) {
+ // ops: back -> front, early-inc so visit() may erase current op safely
+ for (Operation &op : llvm::reverse(block)) {
+ // make sure we first visit inside the region op (so yield op first)
+ // and then move to region op itself
+ for (Region &nested : llvm::reverse(op.getRegions()))
+ walkRegionBackward(nested, visit);
+
+ visit(&op);
+ }
+ }
+}
+
+static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
+ xegpu::DistributeLayoutAttr layout = nullptr;
+ for (OpOperand &use : result.getUses()) {
+ if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
+ // debug print the use and op, and the tmpLayout
+ LLVM_DEBUG({
+ DBGS() << " use: " << use.getOwner()->getName() << use.getOwner();
+ llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
+ });
+ // under debug mode, we want to check all the use points to make sure
+ // there is no conflict, so we do not break here. In release mode, we can
+ // break at the first use
+#ifndef NDEBUG
+ assert(!layout || layout == tmpLayout);
+ layout = tmpLayout;
+#else
+ layout = tmpLayout;
+ break;
+#endif
+ }
+ }
+ return layout;
+}
+
+// For regular operations: First the result layouts are propagated from uses.
+// Then the result layouts are propagated to uses (operands).
+static void propagateResultsToRegularOperands(Operation *op) {
+ LLVM_DEBUG(DBGS() << "propagateResultsToRegularOperands: " << op->getName()
+ << " (" << op->getNumOperands() << " operands, "
+ << op->getNumResults() << " results)\n");
+
+ if (op->getNumResults() == 0) {
+ LLVM_DEBUG(DBGS() << " skipping (no results)\n");
+ return;
+ }
+
+ Value result = op->getResult(0);
+ xegpu::DistributeLayoutAttr resLayout =
+ getLayoutFromUsePoints(op->getResult(0));
+ Type resultType = result.getType();
+
+ // recover layout for tensor Descriptor type, which is a special case since
+ // its layout is not stored as an attribute but encoded in the type itself.
+ // For vector type, we attach the layout as an attribute to op.
+ if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+ auto typeWithLayout = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+ result.setType(typeWithLayout);
+ }
+
+ for (OpOperand &opr : op->getOpOperands()) {
+ // Layouts are needed for vector type only.
+ xegpu::DistributeLayoutAttr operandLayout =
+ xegpu::inferSourceLayoutFromResult(opr, resLayout);
+ if (!isa<VectorType>(opr.get().getType())) {
+ LLVM_DEBUG(DBGS() << " operand #" << opr.getOperandNumber()
+ << ": skipped (non-vector type: " << opr.get().getType()
+ << ")\n");
+ continue;
+ }
+
+ xegpu::setTemporaryLayout(opr, operandLayout);
+ // debug print op
+ LLVM_DEBUG(DBGS() << "after propagateResultsToRegularOperands op: "
+ << op->getName() << op << " operand #"
+ << opr.getOperandNumber()
+ << ": type=" << opr.get().getType());
+ llvm::dbgs() << ", temp Layout=" << xegpu::getTemporaryLayout(opr);
+ llvm::dbgs() << "\n";
+ }
+}
+
+static void propagateRegionResultsToYieldOperands(
+ mlir::RegionBranchTerminatorOpInterface yieldOp) {
+ LLVM_DEBUG(DBGS() << "propagateRegionResultsToYieldOperands: "
+ << yieldOp->getName() << " (" << yieldOp->getNumOperands()
+ << " operands), parent="
+ << yieldOp->getParentOp()->getName() << "\n");
+
+ if (func::FuncOp func = dyn_cast<func::FuncOp>(yieldOp->getParentOp())) {
+ LLVM_DEBUG(DBGS() << " skipping (parent is FuncOp)\n");
+ return;
+ }
+ llvm::SmallVector<mlir::RegionSuccessor> successors;
+ llvm::SmallVector<mlir::Attribute> operands(yieldOp->getNumOperands(),
+ nullptr);
+ yieldOp.getSuccessorRegions(operands, successors);
+
+ auto regionBranchOp = cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+
+ LLVM_DEBUG(DBGS() << " found " << successors.size() << " successors\n");
+ for (mlir::RegionSuccessor &successor : successors) {
+ // debug print out successor
+ LLVM_DEBUG({
+ DBGS() << " successor: ";
+ if (successor.isParent()) {
+ DBGS() << "(parent operation)";
+ } else {
+ DBGS() << "region with " << successor.getSuccessor()->getNumArguments()
+ << " arguments";
+ }
+ DBGS() << "\n";
+ });
+ // find out the successor which is the parent region of yieldOp
+ // if (successor.getSuccessor() != yieldOp->getParentRegion()) {
+ // LLVM_DEBUG(DBGS() << " skipping successor (not parent region)\n");
+ // continue;
+ // }
+ if (!successor.isParent())
+ continue;
+ // propagate the layout from region result to yield operands
+ ValueRange successorInputs = regionBranchOp.getSuccessorInputs(successor);
+ LLVM_DEBUG(DBGS() << " propagating " << successorInputs.size()
+ << " region results to yield operands\n");
+ for (unsigned i = 0; i < successorInputs.size(); ++i) {
+ Value regionResult = successorInputs[i];
+
+ // debug print regionResult
+ LLVM_DEBUG({
+ DBGS() << " before propagateRegionResultsToYieldOperands, Region IR:";
+ DBGS() << " region result #" << i
+ << ": type=" << regionResult.getType();
+ llvm::dbgs() << regionResult;
+ llvm::dbgs() << "\n";
+ });
+ // Find all the uses of the region result and propagate the layout to the
+ // corresponding yield operand. The layout is taken from the temporary
+ // operand layout of any of these uses that has one.
+ xegpu::DistributeLayoutAttr layout = getLayoutFromUsePoints(regionResult);
+
+ // auto layout = xegpu::getDistributeLayoutAttr(regionResult);
+ if (layout == nullptr) {
+ LLVM_DEBUG(DBGS() << " region result #" << i
+ << ": skipped (no layout)\n");
continue;
- auto layout = xegpu::getDistributeLayoutAttr(operand.get());
- if (!layout) {
- op->emitWarning("Could not find layout attribute for operand ")
- << operand.getOperandNumber() << " of operation " << op->getName();
+ }
+ assert(
+ layout &&
+ "region result layout must be defined before propagating to yield");
+
+ if (auto opResult = dyn_cast<OpResult>(regionResult))
+ xegpu::setTemporaryLayout(opResult, layout);
+ xegpu::setTemporaryLayout(yieldOp->getOpOperand(i), layout);
+
+ LLVM_DEBUG({
+ DBGS() << " after propagateRegionResultsToYieldOperands, Region IR:";
+ regionResult.print(llvm::dbgs());
+ if (Operation *defOp = regionResult.getDefiningOp())
+ defOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ llvm::dbgs() << "\n";
+ });
+ }
+ }
+}
+
+static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
+ LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
+ << " (" << regionOp->getNumOperands() << " operands, "
+ << regionOp->getNumRegions() << " regions)\n");
+ DBGS() << " before propagateRegionArgsToInits, Region IR:";
+ regionOp.print(llvm::dbgs());
+ DBGS() << " complex debug Region IR:";
+ regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ // Get entry successors (regions that can be entered initially)
+ SmallVector<RegionSuccessor> successors;
+ regionOp.getEntrySuccessorRegions(/*operands=*/ArrayRef<Attribute>(),
+ successors);
+
+ LLVM_DEBUG(DBGS() << " found " << successors.size()
+ << " entry successors\n");
+ // For each possible entry region, get the operands forwarded to it
+ for (RegionSuccessor &successor : successors) {
+ OperandRange initOperands = regionOp.getEntrySuccessorOperands(successor);
+ unsigned beginIdx = initOperands.getBeginOperandIndex();
+ unsigned numArgs = successor.getSuccessor()->getNumArguments();
+ LLVM_DEBUG(DBGS() << " successor region: " << numArgs
+ << " args, initOperands beginIdx=" << beginIdx
+ << ", count=" << initOperands.size() << "\n");
+ // initOperands are the initialization arguments for this successor
+ // iterate the region arguments
+ for (unsigned i = 0; i < numArgs; ++i) {
+ Value regionArg =
+ successor.getSuccessor()->getArgument(i); // region argument
+ auto layout = xegpu::getDistributeLayoutAttr(regionArg);
+ if (layout == nullptr) {
+ LLVM_DEBUG(DBGS() << " region argument #" << i
+ << ": skipped (no layout)\n");
continue;
}
- xegpu::setTemporaryLayout(operand, layout);
+ assert(
+ layout &&
+ "region argument layout must be defined before propagating to init");
+ LLVM_DEBUG(DBGS() << " regionArg #" << i << ": type="
+ << regionArg.getType() << ", layout=" << layout
+ << " -> init operand #" << (beginIdx + i) << "\n");
+ xegpu::setTemporaryLayout(regionOp->getOpOperand(beginIdx + i), layout);
}
- return WalkResult::advance();
+ }
+ DBGS() << " after propagateRegionArgsToInits, Region IR:";
+ regionOp.print(llvm::dbgs());
+}
+
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+ LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts START ===\n");
+
+ auto processFunc = [&](Region &body, StringRef funcName) {
+ LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
+ walkRegionBackward(body, [&](Operation *op) {
+ LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
+ if (op->getNumResults() > 0) {
+ LLVM_DEBUG(llvm::dbgs() << " [results: " << op->getNumResults());
+ for (OpResult res : op->getResults()) {
+ auto layout = xegpu::getDistributeLayoutAttr(res);
+ LLVM_DEBUG(llvm::dbgs() << " r#" << res.getResultNumber() << "="
+ << (layout ? layout : nullptr));
+ }
+ LLVM_DEBUG(llvm::dbgs() << "]");
+ }
+ LLVM_DEBUG(llvm::dbgs() << "\n");
+ if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
+ // hit the region op after visiting inside region
+ LLVM_DEBUG(DBGS() << " -> dispatching as RegionBranchOp\n");
+ propagateRegionArgsToInits(regionOp);
+ } else if (auto yieldOp =
+ dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
+ // yield op inside region op
+ LLVM_DEBUG(DBGS() << " -> dispatching as YieldOp\n");
+ propagateRegionResultsToYieldOperands(yieldOp);
+ } else if (!dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
+ // if the op is regular op, calling propagateResultsToRegularOperands
+ LLVM_DEBUG(DBGS() << " -> dispatching as regular op\n");
+ propagateResultsToRegularOperands(op);
+ }
+ });
+ };
+
+ rootOp->walk([&](func::FuncOp func) {
+ processFunc(func.getBody(), func.getSymName());
});
- return !result.wasInterrupted();
+ rootOp->walk([&](gpu::GPUFuncOp func) {
+ processFunc(func.getBody(), func.getName());
+ });
+
+ LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
+ return true;
}
+// // Attach layout attributes to all vector-type operands of operations within
+// // the given operation's region. Reports an error if any vector operand lacks
+// // a layout attribute.
+// bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+// auto result = rootOp->walk([&](Operation *op) {
+// for (OpOperand &operand : op->getOpOperands()) {
+// // Layouts are needed for vector type only.
+// if (!isa<VectorType>(operand.get().getType()))
+// continue;
+// // Skip block arguments since they don't have defining ops to attach
+// // layout attributes to.
+// if (isa<BlockArgument>(operand.get()))
+// continue;
+// auto layout = xegpu::getDistributeLayoutAttr(operand.get());
+// if (!layout) {
+// op->emitWarning("Could not find layout attribute for operand ")
+// << operand.getOperandNumber() << " of operation " <<
+// op->getName();
+// xegpu::setTemporaryLayout(operand, layout);
+// continue;
+// }
+// }
+// return WalkResult::advance();
+// });
+// return !result.wasInterrupted();
+// }
+
template <typename T, typename>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
Operation *owner = operandOrResult.getOwner();
@@ -1108,99 +1412,178 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
return std::nullopt;
}
-xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
+xegpu::DistributeLayoutAttr
+xegpu::inferSourceLayoutFromResult(OpOperand &operand,
+ xegpu::DistributeLayoutAttr resLayout) {
Operation *op = operand.getOwner();
unsigned idx = operand.getOperandNumber();
- xegpu::DistributeLayoutAttr resLayout;
- if (op->getNumResults() == 1)
- resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
// For vector::BroadcastOp, infer the source layout from the result layout.
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG(DBGS() << " -> BroadcastOp\n");
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
+ }
auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
- if (!srcTy)
+ if (!srcTy) {
+ LLVM_DEBUG(DBGS() << " source is not VectorType, returning null\n");
return xegpu::DistributeLayoutAttr();
- return xegpu::inferBroadcastSourceLayout(
+ }
+ auto inferred = xegpu::inferBroadcastSourceLayout(
resLayout, broadcast.getResultVectorType().getShape(),
srcTy.getShape());
+ LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
+ return inferred;
}
// For vector::MultiDimReductionOp, infer source layout from result layout
// using reduction dims. Acc operand is expected to have the same layout as
// the result.
if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG(DBGS() << " -> MultiDimReductionOp, operand idx=" << idx
+ << "\n");
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
+ }
if (idx == 0) {
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
- return xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
+ LLVM_DEBUG({
+ DBGS() << " reductionDims=[";
+ llvm::interleaveComma(reductionDims, llvm::dbgs());
+ llvm::dbgs() << "]\n";
+ });
+ auto inferred =
+ xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
+ LLVM_DEBUG(DBGS() << " inferred source layout=" << inferred << "\n");
+ return inferred;
}
- if (idx == 1)
+ if (idx == 1) {
+ LLVM_DEBUG(DBGS() << " acc operand, using resLayout\n");
return resLayout;
+ }
}
if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG(DBGS() << " -> ReductionOp\n");
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
- return xegpu::inferReductionSourceLayout(resLayout);
+ }
+ auto inferred = xegpu::inferReductionSourceLayout(resLayout);
+ LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
+ return inferred;
}
// For vector::BitCastOp, infer source layout from result layout using
// element type bitwidths.
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG(DBGS() << " -> BitCastOp\n");
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
+ }
int resElemBitWidth =
bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
int srcElemBitWidth =
bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
- return xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
- srcElemBitWidth);
+ LLVM_DEBUG(DBGS() << " resBitWidth=" << resElemBitWidth
+ << ", srcBitWidth=" << srcElemBitWidth << "\n");
+ auto inferred = xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
+ srcElemBitWidth);
+ LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
+ return inferred;
}
// For vector::ShapeCastOp, infer source layout from result layout using
// shapes.
if (auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG({
+ DBGS() << " -> ShapeCastOp: resShape=[";
+ llvm::interleaveComma(shapeCast.getResultVectorType().getShape(),
+ llvm::dbgs());
+ llvm::dbgs() << "], srcShape=[";
+ llvm::interleaveComma(shapeCast.getSourceVectorType().getShape(),
+ llvm::dbgs());
+ llvm::dbgs() << "]\n";
+ });
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
- return xegpu::inferShapeCastSourceLayout(
+ }
+ auto inferred = xegpu::inferShapeCastSourceLayout(
resLayout, shapeCast.getResultVectorType().getShape(),
shapeCast.getSourceVectorType().getShape());
+ LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
+ return inferred;
}
// For vector::InsertStridedSliceOp, infer source layout from result layout.
// Dest vector must have the same layout as the result.
if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG(DBGS() << " -> InsertStridedSliceOp, operand idx=" << idx
+ << "\n");
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
- if (idx == 0)
- return xegpu::inferInsertStridedSliceSourceLayout(
+ }
+ if (idx == 0) {
+ auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
resLayout, insertSlice.getDestVectorType().getShape(),
insertSlice.getSourceVectorType().getShape());
- if (idx == 1)
+ LLVM_DEBUG(DBGS() << " inferred source layout=" << inferred << "\n");
+ return inferred;
+ }
+ if (idx == 1) {
+ LLVM_DEBUG(DBGS() << " dest operand, using resLayout\n");
return resLayout;
+ }
}
// For vector::TransposeOp, infer source layout from result layout using
// permutation.
if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
- if (!resLayout)
+ LLVM_DEBUG({
+ DBGS() << " -> TransposeOp, perm=[";
+ llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
+ llvm::dbgs() << "]\n";
+ });
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
return xegpu::DistributeLayoutAttr();
- return xegpu::inferTransposeSourceLayout(resLayout,
- transpose.getPermutation());
+ }
+ auto inferred = xegpu::inferTransposeSourceLayout(
+ resLayout, transpose.getPermutation());
+ LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
+ return inferred;
}
// For elementwise operations, all operands must have the same layout as the
// result.
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
+ LLVM_DEBUG(DBGS() << " -> elementwise op, using resLayout="
+ << (resLayout ? resLayout : nullptr) << "\n");
if (!resLayout)
return xegpu::DistributeLayoutAttr();
return resLayout;
}
- // TODO: Handle more cases as needed here.
+ return xegpu::DistributeLayoutAttr();
+}
+
+xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
+ Operation *op = operand.getOwner();
+ xegpu::DistributeLayoutAttr resLayout;
+ if (op->getNumResults() == 1)
+ resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+ auto inferredOperandLayout = inferSourceLayoutFromResult(operand, resLayout);
+ if (inferredOperandLayout)
+ return inferredOperandLayout;
// By default, assume no layout conflict and return the current layout of
// the operand.
- return xegpu::getDistributeLayoutAttr(operand.get());
+ auto fallback = xegpu::getDistributeLayoutAttr(operand);
+ LLVM_DEBUG(DBGS() << " -> fallback (unhandled op " << op->getName()
+ << "), returning operand layout="
+ << (fallback ? fallback : nullptr) << "\n");
+ return fallback;
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 243581b4ce522..a762458105e47 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -23,10 +23,14 @@
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
+#define DEBUG_TYPE "xegpu-utils"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
using namespace mlir;
/// convert ArrayRef<ValueRange> into SmallVector<Value>
@@ -145,19 +149,31 @@ std::string xegpu::getTemporaryLayoutName(const OpResult result) {
}
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
- if (!value)
+ LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(Value): type="
+ << value.getType() << "\n");
+ if (!value) {
+ LLVM_DEBUG(DBGS() << " -> null value, returning nullptr\n");
return nullptr;
+ }
if (auto tdescTy =
- dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
- return tdescTy.getLayoutAttr();
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
+ auto layout = tdescTy.getLayoutAttr();
+ LLVM_DEBUG(DBGS() << " -> TensorDescType, layout="
+ << (layout ? layout : nullptr) << "\n");
+ return layout;
+ }
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
+ LLVM_DEBUG(DBGS() << " OpResult #" << result.getResultNumber() << " from "
+ << defOp->getName() << "\n");
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
auto layout = anchorOp.getAnchorLayout();
+ LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface, layout="
+ << (layout ? layout : nullptr) << "\n");
return layout;
}
@@ -165,59 +181,100 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (defOp->hasAttr(layoutName)) {
auto layout =
defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ LLVM_DEBUG(DBGS() << " -> temporary attr '" << layoutName
+ << "', layout=" << layout << "\n");
return layout;
}
+ LLVM_DEBUG(DBGS() << " -> OpResult: no layout found (checked '"
+ << layoutName << "')\n");
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto *parentOp = arg.getOwner()->getParentOp();
+ LLVM_DEBUG(DBGS() << " BlockArgument #" << arg.getArgNumber() << " of "
+ << (parentOp ? parentOp->getName().getStringRef()
+ : StringRef("(null)"))
+ << "\n");
if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
- if (tiedInit)
+ if (tiedInit) {
+ LLVM_DEBUG(DBGS() << " -> LoopLikeOp, recursing into tiedInit "
+ << "operand #" << tiedInit->getOperandNumber()
+ << "\n");
return getDistributeLayoutAttr(tiedInit->get());
+ }
+ LLVM_DEBUG(DBGS() << " -> LoopLikeOp, no tiedInit\n");
}
}
+ LLVM_DEBUG(DBGS() << " -> returning nullptr\n");
return nullptr;
}
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
+ LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(OpOperand): operand #" << idx
+ << " of " << op->getName()
+ << ", type=" << opr.get().getType() << "\n");
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
if (idx == 0) {
- return dpasOp.getLayoutAAttr();
+ auto layout = dpasOp.getLayoutAAttr();
+ LLVM_DEBUG(DBGS() << " -> DpasOp layoutA="
+ << (layout ? layout : nullptr) << "\n");
+ return layout;
} else if (idx == 1) {
- return dpasOp.getLayoutBAttr();
+ auto layout = dpasOp.getLayoutBAttr();
+ LLVM_DEBUG(DBGS() << " -> DpasOp layoutB="
+ << (layout ? layout : nullptr) << "\n");
+ return layout;
} else if (idx == 2) {
- return dpasOp.getLayoutCdAttr();
+ auto layout = dpasOp.getLayoutCdAttr();
+ LLVM_DEBUG(DBGS() << " -> DpasOp layoutCd="
+ << (layout ? layout : nullptr) << "\n");
+ return layout;
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
- return convertOp.getInputLayoutAttr();
+ auto layout = convertOp.getInputLayoutAttr();
+ LLVM_DEBUG(DBGS() << " -> ConvertLayoutOp inputLayout="
+ << (layout ? layout : nullptr) << "\n");
+ return layout;
}
auto layout = anchorOp.getAnchorLayout();
- if (idx == 0)
+ if (idx == 0) {
+ LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface idx=0, layout="
+ << (layout ? layout : nullptr) << "\n");
return layout;
+ }
// For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
// the layout is valid for the first two operands: value and memref/tdesc.
// For other operations, the layout applies to the first operand only.
if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
op) &&
- (idx < 2))
+ (idx < 2)) {
+ LLVM_DEBUG(DBGS() << " -> Store op idx=" << idx
+ << ", layout=" << (layout ? layout : nullptr) << "\n");
return layout;
+ }
+ LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface idx=" << idx
+ << " not covered, falling through\n");
}
std::string layoutName = xegpu::getTemporaryLayoutName(opr);
if (op->hasAttr(layoutName)) {
auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+ LLVM_DEBUG(DBGS() << " -> temporary attr '" << layoutName
+ << "', layout=" << layout << "\n");
return layout;
}
+ LLVM_DEBUG(DBGS() << " -> returning nullptr (checked '" << layoutName
+ << "')\n");
return nullptr;
}
>From f77e110d9dc81257b2deeb9cccb20e10bea3739b Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 05:31:42 +0000
Subject: [PATCH 2/7] pass while
---
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 253 +++++++-----------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 +-
2 files changed, 103 insertions(+), 152 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 06cd0eaa0059e..47148870eeaae 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -147,13 +147,8 @@ static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
// under debug mode, we want to check all the use points to make sure
// there is no conflict, so we do not break here. In release mode, we can
// break at the first use
-#ifndef NDEBUG
- assert(!layout || layout == tmpLayout);
- layout = tmpLayout;
-#else
- layout = tmpLayout;
- break;
-#endif
+ if (!layout)
+ layout = tmpLayout;
}
}
return layout;
@@ -215,127 +210,118 @@ static void propagateRegionResultsToYieldOperands(
<< " operands), parent="
<< yieldOp->getParentOp()->getName() << "\n");
- if (func::FuncOp func = dyn_cast<func::FuncOp>(yieldOp->getParentOp())) {
+ if (isa<func::FuncOp>(yieldOp->getParentOp())) {
LLVM_DEBUG(DBGS() << " skipping (parent is FuncOp)\n");
return;
}
- llvm::SmallVector<mlir::RegionSuccessor> successors;
- llvm::SmallVector<mlir::Attribute> operands(yieldOp->getNumOperands(),
- nullptr);
- yieldOp.getSuccessorRegions(operands, successors);
- auto regionBranchOp = cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+ auto regionBranchOp =
+ dyn_cast<RegionBranchOpInterface>(yieldOp->getParentOp());
+ if (!regionBranchOp) {
+ LLVM_DEBUG(DBGS() << " skipping (parent is not RegionBranchOp)\n");
+ return;
+ }
- LLVM_DEBUG(DBGS() << " found " << successors.size() << " successors\n");
- for (mlir::RegionSuccessor &successor : successors) {
- // debug print out successorr
- LLVM_DEBUG({
- DBGS() << " successor: ";
- if (successor.isParent()) {
- DBGS() << "(parent operation)";
- } else {
- DBGS() << "region with " << successor.getSuccessor()->getNumArguments()
- << " arguments";
- }
- DBGS() << "\n";
- });
- // find out the successor which is the parent region of yieldOp
- // if (successor.getSuccessor() != yieldOp->getParentRegion()) {
- // LLVM_DEBUG(DBGS() << " skipping successor (not parent region)\n");
- // continue;
- // }
- if (!successor.isParent())
- continue;
- // propagate the layout from region result to yield operands
- ValueRange successorInputs = regionBranchOp.getSuccessorInputs(successor);
- LLVM_DEBUG(DBGS() << " propagating " << successorInputs.size()
- << " region results to yield operands\n");
- for (unsigned i = 0; i < successorInputs.size(); ++i) {
- Value regionResult = successorInputs[i];
-
- // debug print regionResult
- LLVM_DEBUG({
- DBGS() << " before propagateRegionResultsToYieldOperands, Region IR:";
- DBGS() << " region result #" << i
- << ": type=" << regionResult.getType();
- llvm::dbgs() << regionResult;
- llvm::dbgs() << "\n";
- });
- // find all the use of region result, and propagate the layout to the
- // corresponding yield operand for all use of region result, get its
- // layout from temporary operand layout if any of these use have it
- xegpu::DistributeLayoutAttr layout = getLayoutFromUsePoints(regionResult);
-
- // auto layout = xegpu::getDistributeLayoutAttr(regionResult);
- if (layout == nullptr) {
- LLVM_DEBUG(DBGS() << " region result #" << i
- << ": skipped (no layout)\n");
- continue;
- }
- assert(
- layout &&
- "region result layout must be defined before propagating to yield");
+ // Gather layouts for each result of the parent region op from external
+ // use points.
+ unsigned numResults = regionBranchOp->getNumResults();
+ LLVM_DEBUG(DBGS() << " parent op has " << numResults << " results\n");
+
+ SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
+ for (unsigned i = 0; i < numResults; ++i) {
+ OpResult result = regionBranchOp->getResult(i);
+ resultLayouts[i] = getLayoutFromUsePoints(result);
+ if (resultLayouts[i]) {
+ LLVM_DEBUG(DBGS() << " result #" << i << ": type=" << result.getType()
+ << ", layout=" << resultLayouts[i] << "\n");
+ xegpu::setTemporaryLayout(result, resultLayouts[i]);
+ } else {
+ LLVM_DEBUG(DBGS() << " result #" << i
+ << ": skipped (no layout from use points)\n");
+ }
+ }
- if (auto opResult = dyn_cast<OpResult>(regionResult))
- xegpu::setTemporaryLayout(opResult, layout);
- xegpu::setTemporaryLayout(yieldOp->getOpOperand(i), layout);
+ // Use getSuccessorOperands to find which operands of the terminator
+ // flow to a successor. This handles index offsets automatically (e.g.,
+ // scf.condition's predicate at operand #0 is excluded).
+ // Pick the first successor to determine the operand range.
+ SmallVector<RegionSuccessor> successors;
+ SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
+ yieldOp.getSuccessorRegions(operandAttrs, successors);
+ assert(!successors.empty() && "terminator must have at least one successor");
- LLVM_DEBUG({
- DBGS() << " after propagateRegionResultsToYieldOperands, Region IR:";
- regionResult.print(llvm::dbgs());
- if (Operation *defOp = regionResult.getDefiningOp())
- defOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- llvm::dbgs() << "\n";
- });
- }
+ OperandRange succOps = yieldOp.getSuccessorOperands(successors.front());
+ unsigned beginIdx = succOps.getBeginOperandIndex();
+ unsigned count = std::min(static_cast<unsigned>(succOps.size()), numResults);
+
+ LLVM_DEBUG(DBGS() << " " << count << " successor operands starting at index "
+ << beginIdx << "\n");
+
+ for (unsigned i = 0; i < count; ++i) {
+ if (!resultLayouts[i])
+ continue;
+ LLVM_DEBUG(DBGS() << " -> setting layout on operand #" << (beginIdx + i)
+ << "\n");
+ xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i),
+ resultLayouts[i]);
}
+
+ LLVM_DEBUG({
+ DBGS() << " after propagateRegionResultsToYieldOperands:\n";
+ yieldOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ llvm::dbgs() << "\n";
+ });
}
static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
<< " (" << regionOp->getNumOperands() << " operands, "
<< regionOp->getNumRegions() << " regions)\n");
- DBGS() << " before propagateRegionArgsToInits, Region IR:";
- regionOp.print(llvm::dbgs());
- DBGS() << " complex debug Region IR:";
- regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- // Get entry successors (regions that can be entered initially)
- SmallVector<RegionSuccessor> successors;
- regionOp.getEntrySuccessorRegions(/*operands=*/ArrayRef<Attribute>(),
- successors);
-
- LLVM_DEBUG(DBGS() << " found " << successors.size()
- << " entry successors\n");
- // For each possible entry region, get the operands forwarded to it
- for (RegionSuccessor &successor : successors) {
- OperandRange initOperands = regionOp.getEntrySuccessorOperands(successor);
- unsigned beginIdx = initOperands.getBeginOperandIndex();
- unsigned numArgs = successor.getSuccessor()->getNumArguments();
- LLVM_DEBUG(DBGS() << " successor region: " << numArgs
- << " args, initOperands beginIdx=" << beginIdx
- << ", count=" << initOperands.size() << "\n");
- // initOperands are the initialization arguments for this successor
- // iterate the region arguments
- for (unsigned i = 0; i < numArgs; ++i) {
- Value regionArg =
- successor.getSuccessor()->getArgument(i); // region argument
- auto layout = xegpu::getDistributeLayoutAttr(regionArg);
- if (layout == nullptr) {
- LLVM_DEBUG(DBGS() << " region argument #" << i
- << ": skipped (no layout)\n");
+ LLVM_DEBUG({
+ DBGS() << " before propagateRegionArgsToInits, Region IR:\n";
+ regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ llvm::dbgs() << "\n";
+ });
+
+ // Iterate all regions of the region op. For each block argument that has a
+ // layout (determined from its use points), trace back to find the
+ // corresponding init operand of the regionOp and set the layout on it.
+ // This works generically for scf.for, scf.while, and other
+ // RegionBranchOpInterface ops.
+ for (Region &region : regionOp->getRegions()) {
+ RegionSuccessor regionSuccessor(&region);
+ for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
+ auto layout = getLayoutFromUsePoints(regionArg);
+ if (!layout) {
+ LLVM_DEBUG(DBGS() << " region #" << region.getRegionNumber()
+ << " arg #" << argIdx << ": skipped (no layout)\n");
continue;
}
- assert(
- layout &&
- "region argument layout must be defined before propagating to init");
- LLVM_DEBUG(DBGS() << " regionArg #" << i << ": type="
- << regionArg.getType() << ", layout=" << layout
- << " -> init operand #" << (beginIdx + i) << "\n");
- xegpu::setTemporaryLayout(regionOp->getOpOperand(beginIdx + i), layout);
+ LLVM_DEBUG(DBGS() << " region #" << region.getRegionNumber() << " arg #"
+ << argIdx << ": type=" << regionArg.getType()
+ << ", layout=" << layout << "\n");
+
+ // Find all predecessor values that flow into this block argument.
+ SmallVector<Value> predValues;
+ regionOp.getPredecessorValues(regionSuccessor, argIdx, predValues);
+ for (Value predVal : predValues) {
+ // Match predecessor value to an operand of the regionOp.
+ for (OpOperand &operand : regionOp->getOpOperands()) {
+ if (operand.get() == predVal) {
+ LLVM_DEBUG(DBGS() << " -> setting layout on init operand #"
+ << operand.getOperandNumber() << "\n");
+ xegpu::setTemporaryLayout(operand, layout);
+ }
+ }
+ }
}
}
- DBGS() << " after propagateRegionArgsToInits, Region IR:";
- regionOp.print(llvm::dbgs());
+
+ LLVM_DEBUG({
+ DBGS() << " after propagateRegionArgsToInits, Region IR:\n";
+ regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ llvm::dbgs() << "\n";
+ });
}
bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
@@ -345,16 +331,6 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
walkRegionBackward(body, [&](Operation *op) {
LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
- if (op->getNumResults() > 0) {
- LLVM_DEBUG(llvm::dbgs() << " [results: " << op->getNumResults());
- for (OpResult res : op->getResults()) {
- auto layout = xegpu::getDistributeLayoutAttr(res);
- LLVM_DEBUG(llvm::dbgs() << " r#" << res.getResultNumber() << "="
- << (layout ? layout : nullptr));
- }
- LLVM_DEBUG(llvm::dbgs() << "]");
- }
- LLVM_DEBUG(llvm::dbgs() << "\n");
if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
// hit the region op after visiting inside region
LLVM_DEBUG(DBGS() << " -> dispatching as RegionBranchOp\n");
@@ -1415,16 +1391,16 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
xegpu::DistributeLayoutAttr
xegpu::inferSourceLayoutFromResult(OpOperand &operand,
xegpu::DistributeLayoutAttr resLayout) {
+ if (!resLayout) {
+ LLVM_DEBUG(DBGS() << "no resLayout, returning null\n");
+ return xegpu::DistributeLayoutAttr();
+ }
Operation *op = operand.getOwner();
unsigned idx = operand.getOperandNumber();
// For vector::BroadcastOp, infer the source layout from the result layout.
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
LLVM_DEBUG(DBGS() << " -> BroadcastOp\n");
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
if (!srcTy) {
LLVM_DEBUG(DBGS() << " source is not VectorType, returning null\n");
@@ -1443,10 +1419,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
LLVM_DEBUG(DBGS() << " -> MultiDimReductionOp, operand idx=" << idx
<< "\n");
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
if (idx == 0) {
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
LLVM_DEBUG({
@@ -1467,10 +1439,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
LLVM_DEBUG(DBGS() << " -> ReductionOp\n");
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
auto inferred = xegpu::inferReductionSourceLayout(resLayout);
LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
return inferred;
@@ -1480,10 +1448,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
// element type bitwidths.
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
LLVM_DEBUG(DBGS() << " -> BitCastOp\n");
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
int resElemBitWidth =
bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
int srcElemBitWidth =
@@ -1508,10 +1472,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
llvm::dbgs());
llvm::dbgs() << "]\n";
});
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
auto inferred = xegpu::inferShapeCastSourceLayout(
resLayout, shapeCast.getResultVectorType().getShape(),
shapeCast.getSourceVectorType().getShape());
@@ -1524,10 +1484,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
LLVM_DEBUG(DBGS() << " -> InsertStridedSliceOp, operand idx=" << idx
<< "\n");
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
if (idx == 0) {
auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
resLayout, insertSlice.getDestVectorType().getShape(),
@@ -1549,10 +1505,6 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
llvm::dbgs() << "]\n";
});
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << " no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
auto inferred = xegpu::inferTransposeSourceLayout(
resLayout, transpose.getPermutation());
LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
@@ -1564,8 +1516,7 @@ xegpu::inferSourceLayoutFromResult(OpOperand &operand,
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
LLVM_DEBUG(DBGS() << " -> elementwise op, using resLayout="
<< (resLayout ? resLayout : nullptr) << "\n");
- if (!resLayout)
- return xegpu::DistributeLayoutAttr();
+
return resLayout;
}
return xegpu::DistributeLayoutAttr();
@@ -1581,7 +1532,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
return inferredOperandLayout;
// By default, assume no layout conflict and return the current layout of
// the operand.
- auto fallback = xegpu::getDistributeLayoutAttr(operand);
+ auto fallback = xegpu::getDistributeLayoutAttr(operand.get());
LLVM_DEBUG(DBGS() << " -> fallback (unhandled op " << op->getName()
<< "), returning operand layout="
<< (fallback ? fallback : nullptr) << "\n");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 4c30dacae8850..f0ff771f4cbc4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1338,7 +1338,7 @@ LogicalResult ResolveLayoutConflicts::run() {
// as anchor op for the reduction op's layout.
if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
for (OpResult result : op->getResults()) {
- if (result.getType().isIntOrFloat()) {
+ if (result.getType().isIntOrFloat() || result.use_empty()) {
auto res = assignResultLayout(result);
if (failed(res)) {
DBGS() << "Failed to resolve vector consumer for multi-reduction "
>From 27cc56acf41eb3380d3195fca5a9215b4414a413 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 18:00:50 +0000
Subject: [PATCH 3/7] adding support for DistributeLayoutAttr in TensorDesc
instead of just LayoutAttr
---
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 +++---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 5 +++--
.../XeGPU/Transforms/XeGPUBlocking.cpp | 13 ++++++------
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 21 +++++++++++++------
.../Transforms/XeGPUPeepHoleOptimizer.cpp | 11 +++++++---
.../Transforms/XeGPUSubgroupDistribute.cpp | 9 ++++----
.../Transforms/XeGPUWgToSgDistribute.cpp | 2 +-
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 4 ++--
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 13 ++++++------
9 files changed, 49 insertions(+), 35 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 7e142b20c0894..b13f5a9f2c9d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -82,7 +82,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
static-dim-list ::= decimal-literal `x` decimal-literal
attr-list = (, encoding-attr)? (, layout-attr)?
enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
- layout-attr = (, layout `<`sg_layout = value, sg_data = value, inst_data = value, lane_layout = value, lane_data = value, order = value`>`)?
+ layout-attr = DistributeLayoutAttr
```
Examples:
@@ -158,8 +158,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return llvm::dyn_cast_if_present<T>(getEncoding());
}
- LayoutAttr getLayoutAttr() const {
- return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+ DistributeLayoutAttr getLayoutAttr() const {
+ return llvm::dyn_cast_if_present<DistributeLayoutAttr>(getLayout());
}
xegpu::MemorySpace getMemorySpace() const {
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 0aa2cd45088f3..1b594f17e15ec 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -219,10 +219,11 @@ void setTemporaryLayout(const T &operandOrResult,
/// Helper function to check if the layout is packed. Layout is packed if it is
/// 2D and lane_data[0] != 1 (data packed from col dimension).
/// TODO: Move to target info.
-bool requirePacked(const LayoutAttr layout);
+bool requirePacked(const DistributeLayoutAttr layout);
/// Helper function to check if the layout requires a transpose effect.
-bool requireTranspose(const LayoutAttr layout, const uArch::uArch *uArch);
+bool requireTranspose(const DistributeLayoutAttr layout,
+ const uArch::uArch *uArch);
// Check if dst shape is an expansion of src shape by inserting unit dimensions.
bool matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 1ee0bc6ad9507..ef6a494b76638 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -270,12 +270,11 @@ void XeGPUBlockingPass::runOnOperation() {
}
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
- xegpu::LayoutAttr layout) {
+ xegpu::DistributeLayoutAttr layout) {
int count = 1;
SmallVector<int64_t> tileShape(shape);
- if (layout && layout.getInstData()) {
- DenseI32ArrayAttr instData = layout.getInstData();
- tileShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
+ if (layout && !layout.getEffectiveInstDataAsInt().empty()) {
+ tileShape = layout.getEffectiveInstDataAsInt();
count = computeProduct(shape) / computeProduct(tileShape);
}
return std::make_pair(tileShape, count);
@@ -308,7 +307,7 @@ void XeGPUBlockingPass::runOnOperation() {
Type elemTy = type.getElementType();
ArrayRef<int64_t> shape = type.getShape();
- xegpu::LayoutAttr layout = type.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
if (layout && layout.isForWorkgroup())
return failure();
@@ -348,9 +347,9 @@ void XeGPUBlockingPass::runOnOperation() {
if (chunkSize > 1) {
int64_t blockedChunkSize = chunkSize;
- auto instData = tdescTy.getLayoutAttr().getInstData();
+ auto instData = tdescTy.getLayoutAttr().getEffectiveInstDataAsInt();
if (!instData.empty())
- blockedChunkSize = instData.asArrayRef().back();
+ blockedChunkSize = instData.back();
// To create a new attribute with a different chunk_size:
auto newEncoding = xegpu::ScatterTensorDescAttr::get(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 47148870eeaae..535239e869af1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -141,7 +141,8 @@ static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
// debug print the use and op, and the tmpLayout
LLVM_DEBUG({
- DBGS() << " use: " << use.getOwner()->getName() << use.getOwner();
+ DBGS() << "getLayoutFromUsePoints use: " << use.getOwner()->getName()
+ << use.getOwner();
llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
});
// under debug mode, we want to check all the use points to make sure
@@ -175,10 +176,16 @@ static void propagateResultsToRegularOperands(Operation *op) {
// its layout is not stored as an attribute but encoded in the type itself.
// For vector type, we attach the layout as an attribute to op.
if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
- auto typeWithLayout = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
- result.setType(typeWithLayout);
+ auto layout = tensorDescTy.getLayoutAttr();
+ // TODO: remove the layout check. The tensorDescType's layout is treated as
+ // temporary layout, which needs to be set by layout recovery.
+ // allow it now to pass some legacy test case
+ if (!layout) {
+ auto typeWithLayout = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
+ result.setType(typeWithLayout);
+ }
}
for (OpOperand &opr : op->getOpOperands()) {
@@ -226,6 +233,8 @@ static void propagateRegionResultsToYieldOperands(
// use points.
unsigned numResults = regionBranchOp->getNumResults();
LLVM_DEBUG(DBGS() << " parent op has " << numResults << " results\n");
+ if (numResults == 0)
+ return;
SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
for (unsigned i = 0; i < numResults; ++i) {
@@ -303,7 +312,7 @@ static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
// Find all predecessor values that flow into this block argument.
SmallVector<Value> predValues;
- regionOp.getPredecessorValues(regionSuccessor, argIdx, predValues);
+ regionOp.getPredecessorValues(regionSuccessor, argIdx - 1, predValues);
for (Value predVal : predValues) {
// Match predecessor value to an operand of the regionOp.
for (OpOperand &operand : regionOp->getOpOperands()) {
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 0ece695aed512..9288ba9a0cb56 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -145,10 +145,15 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
return tdescType;
SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
+ auto ctx = tdescType.getContext();
+ auto origLayout = tdescType.getLayoutAttr();
+ SmallVector<int32_t> laneLayoutI32(
+ origLayout.getEffectiveLaneLayoutAsInt().begin(),
+ origLayout.getEffectiveLaneLayoutAsInt().end());
xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
- tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout(),
- DenseI32ArrayAttr::get(tdescType.getContext(), {1, 1}),
- tdescType.getLayoutAttr().getOrder());
+ ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
+ /*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
+ /*order=*/origLayout.getOrder());
// Array length can not be larger than 1 for transpose case.
return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ecdf253d68182..d8ce24ddd5cb0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -256,7 +256,7 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
unsigned operandIdx = operand->getOperandNumber();
- xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = descOp.getType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
@@ -342,7 +342,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Type> offsetTypes = llvm::map_to_vector(
offsetsAsValues, [](Value v) { return v.getType(); });
xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
- xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
storeOp, "the source tensor descriptor lacks layout attribute");
@@ -474,7 +474,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
offsetsAsValues, [](Value v) { return v.getType(); });
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
- xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
loadOp, "the source tensor descriptor lacks layout attribute");
@@ -709,7 +709,8 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Type> offsetTypes = llvm::map_to_vector(
offsetsAsValues, [](Value v) { return v.getType(); });
- xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout =
+ prefetchOp.getTensorDescType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
prefetchOp, "the source tensor descriptor lacks layout attribute");
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 0aead9172858f..e47224bbe755c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1647,7 +1647,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
converter.addConversion(
[&](xegpu::TensorDescType type,
SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
- xegpu::LayoutAttr layout = type.getLayoutAttr();
+ xegpu::DistributeLayoutAttr layout = type.getLayoutAttr();
// Only convert WG-level tensor descs. SG-level or layout-less types
// are already legal and should pass through unchanged.
if (!layout || !layout.isForWorkgroup())
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a762458105e47..55cf47e38dfd0 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -936,7 +936,7 @@ template int
xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
ArrayRef<unsigned> candidateMultiples);
-bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
+bool xegpu::requirePacked(const xegpu::DistributeLayoutAttr layout) {
if (!layout)
return false;
auto laneData = layout.getEffectiveLaneDataAsInt();
@@ -945,7 +945,7 @@ bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
return laneData[0] != 1;
}
-bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
+bool xegpu::requireTranspose(const xegpu::DistributeLayoutAttr layout,
const xegpu::uArch::uArch *uArch) {
// Return false for unsupported targets.
// TODO: Add more support or move to target info.
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 0d10ab7c74da6..4760016bdcea4 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -106,10 +106,9 @@ struct TestXeGPUUnrollingPatterns
}
if (auto layout = tdescTy.getLayoutAttr()) {
- auto inst_data = layout.getInstData();
- if (inst_data && layout.isForSubgroup())
- return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
- inst_data.asArrayRef().end());
+ auto inst_data = layout.getEffectiveInstDataAsInt();
+ if (!inst_data.empty() && layout.isForSubgroup())
+ return SmallVector<int64_t>(inst_data.begin(), inst_data.end());
}
}
@@ -138,9 +137,9 @@ struct TestXeGPUUnrollingPatterns
if (chunkSize > 1) {
int64_t blockedChunkSize = chunkSize;
- auto instData = layout.getInstData();
+ auto instData = layout.getEffectiveInstDataAsInt();
if (!instData.empty())
- blockedChunkSize = instData.asArrayRef().back();
+ blockedChunkSize = instData.back();
// To create a new attribute with a different chunk_size:
auto newEncoding = xegpu::ScatterTensorDescAttr::get(
@@ -150,7 +149,7 @@ struct TestXeGPUUnrollingPatterns
}
}
if (layout) {
- if (layout.getLaneLayout() == nullptr)
+ if (layout.getEffectiveLaneLayoutAsInt().empty())
layout = xegpu::LayoutAttr();
else
layout = layout.dropInstData();
>From 0690c6cc01e121b137c1056e62d22ff207a82777 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:02:43 +0000
Subject: [PATCH 4/7] fix bugs
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 +-
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 6 +++
.../Transforms/XeGPUPeepHoleOptimizer.cpp | 19 ++++++++--
.../Transforms/XeGPUSubgroupDistribute.cpp | 38 ++++++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 19 +---------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 4 +-
6 files changed, 64 insertions(+), 24 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 950371e17255f..64c56b5adf5d7 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -1318,7 +1318,7 @@ mlir::Type TensorDescType::parse(AsmParser &parser) {
mlir::Attribute attr;
ParseResult res = parser.parseAttribute(attr);
if (mlir::succeeded(res)) {
- if (mlir::isa<LayoutAttr>(attr)) {
+ if (mlir::isa<DistributeLayoutAttr>(attr)) {
layout = attr;
continue;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 535239e869af1..33c9086566d3c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -365,6 +365,12 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
});
LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
+ // print the root op after
+ LLVM_DEBUG({
+ DBGS() << "After recoverTemporaryLayouts, IR:\n";
+ rootOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
+ llvm::dbgs() << "\n";
+ });
return true;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index 9288ba9a0cb56..c43eaba5b3ee6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -28,6 +28,7 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
#include <optional>
namespace mlir {
@@ -147,13 +148,25 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
auto ctx = tdescType.getContext();
auto origLayout = tdescType.getLayoutAttr();
- SmallVector<int32_t> laneLayoutI32(
- origLayout.getEffectiveLaneLayoutAsInt().begin(),
- origLayout.getEffectiveLaneLayoutAsInt().end());
+ auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
+ SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
+ laneLayoutI64.end());
+ LLVM_DEBUG({
+ DBGS() << "tryOptimize: origLayout=" << origLayout << "\n";
+ DBGS() << " laneLayoutI32=[";
+ llvm::interleaveComma(laneLayoutI32, llvm::dbgs());
+ llvm::dbgs() << "], laneData=[1, 1]";
+ if (origLayout.getOrder())
+ llvm::dbgs() << ", order=" << origLayout.getOrder();
+ llvm::dbgs() << "\n";
+ DBGS() << " supportedShape=[" << supportedHeight << ", " << supportedWidth
+ << "], newElemTy=" << newElemTy << ", arrayLen=" << arrayLen << "\n";
+ });
xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
/*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
/*order=*/origLayout.getOrder());
+ LLVM_DEBUG(DBGS() << " newLayout=" << newLayout << "\n");
// Array length can not be larger than 1 for transpose case.
return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d8ce24ddd5cb0..27cf788933f18 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -800,10 +800,17 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
+ LLVM_DEBUG(DBGS() << "StoreDistribution: attempting to match\n");
Operation *lastNode = warpOp.getTerminator()->getPrevNode();
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
- if (!storeScatterOp)
+ if (!storeScatterOp) {
+ LLVM_DEBUG(
+ DBGS()
+ << "StoreDistribution: last node is not StoreScatterOp, skipping\n");
return failure();
+ }
+ LLVM_DEBUG(DBGS() << "StoreDistribution: matched StoreScatterOp: "
+ << *storeScatterOp << "\n");
auto offsets = storeScatterOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()))
return rewriter.notifyMatchFailure(
@@ -811,10 +818,15 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
+ LLVM_DEBUG(DBGS() << "StoreDistribution: offsetsTy=" << offsetsTy
+ << ", maskTy=" << maskTy << ", storeVecTy=" << storeVecTy
+ << "\n");
// Add handling for leading unit dimensions support
int chunkSize = storeScatterOp.getChunkSize().value_or(1);
int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+ LLVM_DEBUG(DBGS() << "StoreDistribution: chunkSize=" << chunkSize
+ << ", effectiveVecRank=" << effectiveVecRank << "\n");
// Check that all leading dimensions are unit dimensions
for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
@@ -831,6 +843,24 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
auto layoutMask =
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
+ LLVM_DEBUG({
+ DBGS() << "StoreDistribution: layoutPayload=";
+ if (layoutPayload)
+ DBGS() << layoutPayload;
+ else
+ DBGS() << "(null)";
+ DBGS() << ", layoutOffsets=";
+ if (layoutOffsets)
+ DBGS() << layoutOffsets;
+ else
+ DBGS() << "(null)";
+ DBGS() << ", layoutMask=";
+ if (layoutMask)
+ DBGS() << layoutMask;
+ else
+ DBGS() << "(null)";
+ DBGS() << "\n";
+ });
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -849,6 +879,9 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
+ LLVM_DEBUG(DBGS() << "StoreDistribution: distPayloadTy=" << distPayloadTy
+ << ", distOffsetsTy=" << distOffsetsTy
+ << ", distMaskTy=" << distMaskTy << "\n");
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
@@ -885,7 +918,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
+ LLVM_DEBUG(DBGS() << "StoreDistribution: created new op: " << newOp
+ << "\n");
rewriter.eraseOp(storeScatterOp);
+ LLVM_DEBUG(DBGS() << "StoreDistribution: done\n");
return success();
}
};
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 842c2375dd31d..0d1bfd5480aa2 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,22 +473,6 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_transpose
-// CHECK: %[[SRC:.*]] = "some_op"()
-// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
-// CHECK-NEXT: %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
-// CHECK-NEXT: gpu.return
-gpu.func @vector_transpose() {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
- : () -> (vector<16x2xf32>)
- %transpose = vector.transpose %cst, [1, 0]
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x2xf32> to vector<2x16xf32>
- gpu.return
-}
// CHECK-LABEL: gpu.func @vector_bitcast
// CHECK: %[[SRC:.*]] = "some_op"()
@@ -1092,7 +1076,8 @@ gpu.module @xevm_module {
gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
%0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
%1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
- "some_use"(%1) : (vector<16x16xf16>) -> ()
+ %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
+ "some_use"(%2) : (vector<16x16xf16>) -> ()
gpu.return
}
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 9ca424374335f..61b8046bd04e5 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -257,7 +257,7 @@ gpu.module @test_kernel {
// -----
#l = #xegpu.layout<inst_data = [16, 16]>
-#r = #xegpu.layout<inst_data = [16]>
+#r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [0]>
gpu.module @test_kernel {
gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32>
@@ -277,7 +277,7 @@ gpu.module @test_kernel {
// -----
#l = #xegpu.layout<inst_data = [16, 16]>
-#r = #xegpu.layout<inst_data = [16]>
+#r = #xegpu.slice<#xegpu.layout<inst_data = [16, 16]>, dims = [1]>
gpu.module @test_kernel {
gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c1 = arith.constant 1 : index
>From ac36ceaccbd9bff10bf933ffef9b0b0d1e557cdc Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:24:35 +0000
Subject: [PATCH 5/7] separate recover temporary layout out to another PR
---
.../XeGPU/Transforms/XeGPULayoutImpl.h | 5 +-
.../XeGPU/Transforms/XeGPULayoutImpl.cpp | 457 +++---------------
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 +-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 19 +-
4 files changed, 73 insertions(+), 410 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
index 5f46eab7b74c7..9cf9a8705209b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@@ -183,13 +183,10 @@ setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
const uArch::uArch *uArch);
-DistributeLayoutAttr
-inferSourceLayoutFromResult(OpOperand &operand, DistributeLayoutAttr resLayout);
-
/// Gets the expected layout for a given consumer operand. This will check if
/// the owning operation of the consumer operand is one of the special layout
/// users and determine the expected layout accordingly.
-DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
+xegpu::DistributeLayoutAttr getConsumerLayoutAt(OpOperand &operand);
} // namespace xegpu
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 33c9086566d3c..55cd6ec04970c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -18,22 +18,16 @@
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
-#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
-#define DEBUG_TYPE "xegpu-layout-recovery"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
using namespace mlir;
void xegpu::recoverTemporaryLayoutsDeprecated(Operation *op) {
@@ -86,321 +80,32 @@ xegpu::dropInstDataOnAttrs(ArrayRef<NamedAttribute> attrs) {
return out;
}
-// Prerequisite for Layout Recovery
-// It relies on the following invariant:
-// 1. there is no layout conflict between different uses of the same definition.
-// 2. each definition has a well-defined layout requirement at its use point.
-// - Every definition must have at least one use that appears after it in
-// topological order.
-// - If a definition has no such use (e.g., a loop result or region output),
-// an explicit convert_layout operation is inserted to create a use.
-// - Only the result of convert_layout is permitted to have no subsequent
-// use.
-
-// The recovery proceeds by scanning the operation in reverse topological order
-// as follows:
-// For regular operations: First the result layouts are propagated from uses.
-// Then the result layouts are propagated to operands.
-//
-// For region operations (e.g., loops):
-// - When backward propagation reaches a region op, it sets the layout of
-// the region op’s results according to use points like regular ops.
-// - Then, the result layouts (such as a loop output) are propagated to
-// their corresponding operands in the yield.
-// - When backward propagation reaches the first operation inside the
-// region, the pass examines the region op’s initialization list,
-// propagating from region arguments to the corresponding initialization
-// operands.
-// - This ensures that layouts are consistently propagated
-// across region boundaries while preserving a single well-defined use for
-// each definition at the region-op level.
-
-// the inner function for recoverTemporaryLayouts is a recursive function
-// the input rootOp is the function operation, which is also a region op.
-// it recursivley process the region op in reverse topological order.
-
-static void walkRegionBackward(Region &region,
- llvm::function_ref<void(Operation *)> visit) {
- // blocks: back -> front
- for (Block &block : llvm::reverse(region)) {
- // ops: back -> front, early-inc so visit() may erase current op safely
- for (Operation &op : llvm::reverse(block)) {
- // make sure we first visit inside the region op (so yield op first)
- // and then move to region op itself
- for (Region &nested : llvm::reverse(op.getRegions()))
- walkRegionBackward(nested, visit);
-
- visit(&op);
- }
- }
-}
-
-static xegpu::DistributeLayoutAttr getLayoutFromUsePoints(Value result) {
- xegpu::DistributeLayoutAttr layout = nullptr;
- for (OpOperand &use : result.getUses()) {
- if (auto tmpLayout = xegpu::getDistributeLayoutAttr(use)) {
- // debug print the use and op, and the tmpLayout
- LLVM_DEBUG({
- DBGS() << "getLayoutFromUsePoints use: " << use.getOwner()->getName()
- << use.getOwner();
- llvm::dbgs() << ", tmpLayout=" << tmpLayout << "\n";
- });
- // under debug mode, we want to check all the use points to make sure
- // there is no conflict, so we do not break here. In release mode, we can
- // break at the first use
- if (!layout)
- layout = tmpLayout;
- }
- }
- return layout;
-}
-
-// For regular operations: First the result layouts are propagated from uses.
-// Then the result layouts are propagated to uses (operands).
-static void propagateResultsToRegularOperands(Operation *op) {
- LLVM_DEBUG(DBGS() << "propagateResultsToRegularOperands: " << op->getName()
- << " (" << op->getNumOperands() << " operands, "
- << op->getNumResults() << " results)\n");
-
- if (op->getNumResults() == 0) {
- LLVM_DEBUG(DBGS() << " skipping (no results)\n");
- return;
- }
-
- Value result = op->getResult(0);
- xegpu::DistributeLayoutAttr resLayout =
- getLayoutFromUsePoints(op->getResult(0));
- Type resultType = result.getType();
-
- // recover layout for tensor Descriptor type, which is a special case since
- // its layout is not stored as an attribute but encoded in the type itself.
- // For vector type, we attach the layout as an attribute to op.
- if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
- auto layout = tensorDescTy.getLayoutAttr();
- // TODO: remove the layout check. The tensorDescType's layout is treated as
- // temporary layout, which needs to be set by layout recovery.
- // allow it now to pass some legacy test case
- if (!layout) {
- auto typeWithLayout = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(), resLayout);
- result.setType(typeWithLayout);
- }
- }
-
- for (OpOperand &opr : op->getOpOperands()) {
- // Layouts are needed for vector type only.
- xegpu::DistributeLayoutAttr operandLayout =
- xegpu::inferSourceLayoutFromResult(opr, resLayout);
- if (!isa<VectorType>(opr.get().getType())) {
- LLVM_DEBUG(DBGS() << " operand #" << opr.getOperandNumber()
- << ": skipped (non-vector type: " << opr.get().getType()
- << ")\n");
- continue;
- }
-
- xegpu::setTemporaryLayout(opr, operandLayout);
- // debug print op
- LLVM_DEBUG(DBGS() << "after propagateResultsToRegularOperands op: "
- << op->getName() << op << " operand #"
- << opr.getOperandNumber()
- << ": type=" << opr.get().getType());
- llvm::dbgs() << ", temp Layout=" << xegpu::getTemporaryLayout(opr);
- llvm::dbgs() << "\n";
- }
-}
-
-static void propagateRegionResultsToYieldOperands(
- mlir::RegionBranchTerminatorOpInterface yieldOp) {
- LLVM_DEBUG(DBGS() << "propagateRegionResultsToYieldOperands: "
- << yieldOp->getName() << " (" << yieldOp->getNumOperands()
- << " operands), parent="
- << yieldOp->getParentOp()->getName() << "\n");
-
- if (isa<func::FuncOp>(yieldOp->getParentOp())) {
- LLVM_DEBUG(DBGS() << " skipping (parent is FuncOp)\n");
- return;
- }
-
- auto regionBranchOp =
- dyn_cast<RegionBranchOpInterface>(yieldOp->getParentOp());
- if (!regionBranchOp) {
- LLVM_DEBUG(DBGS() << " skipping (parent is not RegionBranchOp)\n");
- return;
- }
-
- // Gather layouts for each result of the parent region op from external
- // use points.
- unsigned numResults = regionBranchOp->getNumResults();
- LLVM_DEBUG(DBGS() << " parent op has " << numResults << " results\n");
- if (numResults == 0)
- return;
-
- SmallVector<xegpu::DistributeLayoutAttr> resultLayouts(numResults, nullptr);
- for (unsigned i = 0; i < numResults; ++i) {
- OpResult result = regionBranchOp->getResult(i);
- resultLayouts[i] = getLayoutFromUsePoints(result);
- if (resultLayouts[i]) {
- LLVM_DEBUG(DBGS() << " result #" << i << ": type=" << result.getType()
- << ", layout=" << resultLayouts[i] << "\n");
- xegpu::setTemporaryLayout(result, resultLayouts[i]);
- } else {
- LLVM_DEBUG(DBGS() << " result #" << i
- << ": skipped (no layout from use points)\n");
- }
- }
-
- // Use getSuccessorOperands to find which operands of the terminator
- // flow to a successor. This handles index offsets automatically (e.g.,
- // scf.condition's predicate at operand #0 is excluded).
- // Pick the first successor to determine the operand range.
- SmallVector<RegionSuccessor> successors;
- SmallVector<Attribute> operandAttrs(yieldOp->getNumOperands(), nullptr);
- yieldOp.getSuccessorRegions(operandAttrs, successors);
- assert(!successors.empty() && "terminator must have at least one successor");
-
- OperandRange succOps = yieldOp.getSuccessorOperands(successors.front());
- unsigned beginIdx = succOps.getBeginOperandIndex();
- unsigned count = std::min(static_cast<unsigned>(succOps.size()), numResults);
-
- LLVM_DEBUG(DBGS() << " " << count << " successor operands starting at index "
- << beginIdx << "\n");
-
- for (unsigned i = 0; i < count; ++i) {
- if (!resultLayouts[i])
- continue;
- LLVM_DEBUG(DBGS() << " -> setting layout on operand #" << (beginIdx + i)
- << "\n");
- xegpu::setTemporaryLayout(yieldOp->getOpOperand(beginIdx + i),
- resultLayouts[i]);
- }
-
- LLVM_DEBUG({
- DBGS() << " after propagateRegionResultsToYieldOperands:\n";
- yieldOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- llvm::dbgs() << "\n";
- });
-}
-
-static void propagateRegionArgsToInits(mlir::RegionBranchOpInterface regionOp) {
- LLVM_DEBUG(DBGS() << "propagateRegionArgsToInits: " << regionOp->getName()
- << " (" << regionOp->getNumOperands() << " operands, "
- << regionOp->getNumRegions() << " regions)\n");
- LLVM_DEBUG({
- DBGS() << " before propagateRegionArgsToInits, Region IR:\n";
- regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- llvm::dbgs() << "\n";
- });
-
- // Iterate all regions of the region op. For each block argument that has a
- // layout (determined from its use points), trace back to find the
- // corresponding init operand of the regionOp and set the layout on it.
- // This works generically for scf.for, scf.while, and other
- // RegionBranchOpInterface ops.
- for (Region &region : regionOp->getRegions()) {
- RegionSuccessor regionSuccessor(&region);
- for (auto [argIdx, regionArg] : llvm::enumerate(region.getArguments())) {
- auto layout = getLayoutFromUsePoints(regionArg);
+// Attach layout attributes to all vector-type operands of operations within
+// the given operation's region. Reports an error if any vector operand lacks
+// a layout attribute.
+bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
+ auto result = rootOp->walk([&](Operation *op) {
+ for (OpOperand &operand : op->getOpOperands()) {
+ // Layouts are needed for vector type only.
+ if (!isa<VectorType>(operand.get().getType()))
+ continue;
+ // Skip block arguments since they don't have defining ops to attach
+ // layout attributes to.
+ if (isa<BlockArgument>(operand.get()))
+ continue;
+ auto layout = xegpu::getDistributeLayoutAttr(operand.get());
if (!layout) {
- LLVM_DEBUG(DBGS() << " region #" << region.getRegionNumber()
- << " arg #" << argIdx << ": skipped (no layout)\n");
+ op->emitWarning("Could not find layout attribute for operand ")
+ << operand.getOperandNumber() << " of operation " << op->getName();
continue;
}
- LLVM_DEBUG(DBGS() << " region #" << region.getRegionNumber() << " arg #"
- << argIdx << ": type=" << regionArg.getType()
- << ", layout=" << layout << "\n");
-
- // Find all predecessor values that flow into this block argument.
- SmallVector<Value> predValues;
- regionOp.getPredecessorValues(regionSuccessor, argIdx - 1, predValues);
- for (Value predVal : predValues) {
- // Match predecessor value to an operand of the regionOp.
- for (OpOperand &operand : regionOp->getOpOperands()) {
- if (operand.get() == predVal) {
- LLVM_DEBUG(DBGS() << " -> setting layout on init operand #"
- << operand.getOperandNumber() << "\n");
- xegpu::setTemporaryLayout(operand, layout);
- }
- }
- }
+ xegpu::setTemporaryLayout(operand, layout);
}
- }
-
- LLVM_DEBUG({
- DBGS() << " after propagateRegionArgsToInits, Region IR:\n";
- regionOp.print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- llvm::dbgs() << "\n";
- });
-}
-
-bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
- LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts START ===\n");
-
- auto processFunc = [&](Region &body, StringRef funcName) {
- LLVM_DEBUG(DBGS() << "Processing func: " << funcName << "\n");
- walkRegionBackward(body, [&](Operation *op) {
- LLVM_DEBUG(DBGS() << "Visiting op: " << op->getName());
- if (auto regionOp = dyn_cast<mlir::RegionBranchOpInterface>(op)) {
- // hit the region op after visiting inside region
- LLVM_DEBUG(DBGS() << " -> dispatching as RegionBranchOp\n");
- propagateRegionArgsToInits(regionOp);
- } else if (auto yieldOp =
- dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
- // yield op inside region op
- LLVM_DEBUG(DBGS() << " -> dispatching as YieldOp\n");
- propagateRegionResultsToYieldOperands(yieldOp);
- } else if (!dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
- // if the op is regular op, calling propagateResultsToRegularOperands
- LLVM_DEBUG(DBGS() << " -> dispatching as regular op\n");
- propagateResultsToRegularOperands(op);
- }
- });
- };
-
- rootOp->walk([&](func::FuncOp func) {
- processFunc(func.getBody(), func.getSymName());
- });
- rootOp->walk([&](gpu::GPUFuncOp func) {
- processFunc(func.getBody(), func.getName());
- });
-
- LLVM_DEBUG(DBGS() << "=== recoverTemporaryLayouts END ===\n");
- // print the root op after
- LLVM_DEBUG({
- DBGS() << "After recoverTemporaryLayouts, IR:\n";
- rootOp->print(llvm::dbgs(), OpPrintingFlags().printGenericOpForm());
- llvm::dbgs() << "\n";
+ return WalkResult::advance();
});
- return true;
+ return !result.wasInterrupted();
}
-// // Attach layout attributes to all vector-type operands of operations within
-// // the given operation's region. Reports an error if any vector operand lacks
-// // a layout attribute.
-// bool xegpu::recoverTemporaryLayouts(Operation *rootOp) {
-// auto result = rootOp->walk([&](Operation *op) {
-// for (OpOperand &operand : op->getOpOperands()) {
-// // Layouts are needed for vector type only.
-// if (!isa<VectorType>(operand.get().getType()))
-// continue;
-// // Skip block arguments since they don't have defining ops to attach
-// // layout attributes to.
-// if (isa<BlockArgument>(operand.get()))
-// continue;
-// auto layout = xegpu::getDistributeLayoutAttr(operand.get());
-// if (!layout) {
-// op->emitWarning("Could not find layout attribute for operand ")
-// << operand.getOperandNumber() << " of operation " <<
-// op->getName();
-// xegpu::setTemporaryLayout(operand, layout);
-// continue;
-// }
-// }
-// return WalkResult::advance();
-// });
-// return !result.wasInterrupted();
-// }
-
template <typename T, typename>
void xegpu::removeLayoutAttr(const T &operandOrResult) {
Operation *owner = operandOrResult.getOwner();
@@ -1403,153 +1108,99 @@ xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
return std::nullopt;
}
-xegpu::DistributeLayoutAttr
-xegpu::inferSourceLayoutFromResult(OpOperand &operand,
- xegpu::DistributeLayoutAttr resLayout) {
- if (!resLayout) {
- LLVM_DEBUG(DBGS() << "no resLayout, returning null\n");
- return xegpu::DistributeLayoutAttr();
- }
+xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
Operation *op = operand.getOwner();
unsigned idx = operand.getOperandNumber();
+ xegpu::DistributeLayoutAttr resLayout;
+ if (op->getNumResults() == 1)
+ resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
// For vector::BroadcastOp, infer the source layout from the result layout.
if (auto broadcast = dyn_cast<vector::BroadcastOp>(op)) {
- LLVM_DEBUG(DBGS() << " -> BroadcastOp\n");
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
auto srcTy = dyn_cast<VectorType>(broadcast.getSourceType());
- if (!srcTy) {
- LLVM_DEBUG(DBGS() << " source is not VectorType, returning null\n");
+ if (!srcTy)
return xegpu::DistributeLayoutAttr();
- }
- auto inferred = xegpu::inferBroadcastSourceLayout(
+ return xegpu::inferBroadcastSourceLayout(
resLayout, broadcast.getResultVectorType().getShape(),
srcTy.getShape());
- LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
- return inferred;
}
// For vector::MultiDimReductionOp, infer source layout from result layout
// using reduction dims. Acc operand is expected to have the same layout as
// the result.
if (auto reduction = dyn_cast<vector::MultiDimReductionOp>(op)) {
- LLVM_DEBUG(DBGS() << " -> MultiDimReductionOp, operand idx=" << idx
- << "\n");
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
if (idx == 0) {
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
- LLVM_DEBUG({
- DBGS() << " reductionDims=[";
- llvm::interleaveComma(reductionDims, llvm::dbgs());
- llvm::dbgs() << "]\n";
- });
- auto inferred =
- xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
- LLVM_DEBUG(DBGS() << " inferred source layout=" << inferred << "\n");
- return inferred;
+ return xegpu::inferMultiReductionSourceLayout(resLayout, reductionDims);
}
- if (idx == 1) {
- LLVM_DEBUG(DBGS() << " acc operand, using resLayout\n");
+ if (idx == 1)
return resLayout;
- }
}
if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
- LLVM_DEBUG(DBGS() << " -> ReductionOp\n");
- auto inferred = xegpu::inferReductionSourceLayout(resLayout);
- LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
- return inferred;
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
+ return xegpu::inferReductionSourceLayout(resLayout);
}
// For vector::BitCastOp, infer source layout from result layout using
// element type bitwidths.
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
- LLVM_DEBUG(DBGS() << " -> BitCastOp\n");
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
int resElemBitWidth =
bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
int srcElemBitWidth =
bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
- LLVM_DEBUG(DBGS() << " resBitWidth=" << resElemBitWidth
- << ", srcBitWidth=" << srcElemBitWidth << "\n");
- auto inferred = xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
- srcElemBitWidth);
- LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
- return inferred;
+ return xegpu::inferBitCastSourceLayout(resLayout, resElemBitWidth,
+ srcElemBitWidth);
}
// For vector::ShapeCastOp, infer source layout from result layout using
// shapes.
if (auto shapeCast = dyn_cast<vector::ShapeCastOp>(op)) {
- LLVM_DEBUG({
- DBGS() << " -> ShapeCastOp: resShape=[";
- llvm::interleaveComma(shapeCast.getResultVectorType().getShape(),
- llvm::dbgs());
- llvm::dbgs() << "], srcShape=[";
- llvm::interleaveComma(shapeCast.getSourceVectorType().getShape(),
- llvm::dbgs());
- llvm::dbgs() << "]\n";
- });
- auto inferred = xegpu::inferShapeCastSourceLayout(
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
+ return xegpu::inferShapeCastSourceLayout(
resLayout, shapeCast.getResultVectorType().getShape(),
shapeCast.getSourceVectorType().getShape());
- LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
- return inferred;
}
// For vector::InsertStridedSliceOp, infer source layout from result layout.
// Dest vector must have the same layout as the result.
if (auto insertSlice = dyn_cast<vector::InsertStridedSliceOp>(op)) {
- LLVM_DEBUG(DBGS() << " -> InsertStridedSliceOp, operand idx=" << idx
- << "\n");
- if (idx == 0) {
- auto inferred = xegpu::inferInsertStridedSliceSourceLayout(
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
+ if (idx == 0)
+ return xegpu::inferInsertStridedSliceSourceLayout(
resLayout, insertSlice.getDestVectorType().getShape(),
insertSlice.getSourceVectorType().getShape());
- LLVM_DEBUG(DBGS() << " inferred source layout=" << inferred << "\n");
- return inferred;
- }
- if (idx == 1) {
- LLVM_DEBUG(DBGS() << " dest operand, using resLayout\n");
+ if (idx == 1)
return resLayout;
- }
}
// For vector::TransposeOp, infer source layout from result layout using
// permutation.
if (auto transpose = dyn_cast<vector::TransposeOp>(op)) {
- LLVM_DEBUG({
- DBGS() << " -> TransposeOp, perm=[";
- llvm::interleaveComma(transpose.getPermutation(), llvm::dbgs());
- llvm::dbgs() << "]\n";
- });
- auto inferred = xegpu::inferTransposeSourceLayout(
- resLayout, transpose.getPermutation());
- LLVM_DEBUG(DBGS() << " inferred=" << inferred << "\n");
- return inferred;
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
+ return xegpu::inferTransposeSourceLayout(resLayout,
+ transpose.getPermutation());
}
// For elementwise operations, all operands must have the same layout as the
// result.
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
- LLVM_DEBUG(DBGS() << " -> elementwise op, using resLayout="
- << (resLayout ? resLayout : nullptr) << "\n");
-
+ if (!resLayout)
+ return xegpu::DistributeLayoutAttr();
return resLayout;
}
- return xegpu::DistributeLayoutAttr();
-}
-
-xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
- Operation *op = operand.getOwner();
- xegpu::DistributeLayoutAttr resLayout;
- if (op->getNumResults() == 1)
- resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
- auto inferredOperandLayout = inferSourceLayoutFromResult(operand, resLayout);
- if (inferredOperandLayout)
- return inferredOperandLayout;
+ // TODO: Handle more cases as needed here.
// By default, assume no layout conflict and return the current layout of
// the operand.
- auto fallback = xegpu::getDistributeLayoutAttr(operand.get());
- LLVM_DEBUG(DBGS() << " -> fallback (unhandled op " << op->getName()
- << "), returning operand layout="
- << (fallback ? fallback : nullptr) << "\n");
- return fallback;
+ return xegpu::getDistributeLayoutAttr(operand.get());
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index f0ff771f4cbc4..4c30dacae8850 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1338,7 +1338,7 @@ LogicalResult ResolveLayoutConflicts::run() {
// as anchor op for the reduction op's layout.
if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
for (OpResult result : op->getResults()) {
- if (result.getType().isIntOrFloat() || result.use_empty()) {
+ if (result.getType().isIntOrFloat()) {
auto res = assignResultLayout(result);
if (failed(res)) {
DBGS() << "Failed to resolve vector consumer for multi-reduction "
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 0d1bfd5480aa2..842c2375dd31d 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -473,6 +473,22 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
gpu.return
}
+// CHECK-LABEL: gpu.func @vector_transpose
+// CHECK: %[[SRC:.*]] = "some_op"()
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC]] : vector<16x2xf32> to vector<1x2xf32>
+// CHECK-NEXT: %[[T:.*]] = vector.transpose %[[CAST]], [1, 0] : vector<1x2xf32> to vector<2x1xf32>
+// CHECK-NEXT: gpu.return
+gpu.func @vector_transpose() {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
+ : () -> (vector<16x2xf32>)
+ %transpose = vector.transpose %cst, [1, 0]
+ {
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x2xf32> to vector<2x16xf32>
+ gpu.return
+}
// CHECK-LABEL: gpu.func @vector_bitcast
// CHECK: %[[SRC:.*]] = "some_op"()
@@ -1076,8 +1092,7 @@ gpu.module @xevm_module {
gpu.func @vector_broadcast_2d_to_2d_noop(%laneid: index) {
%0 = "some_op"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : () -> vector<16x1xf16>
%1 = vector.broadcast %0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
- %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>
- "some_use"(%2) : (vector<16x16xf16>) -> ()
+ "some_use"(%1) : (vector<16x16xf16>) -> ()
gpu.return
}
}
>From 1328c5ff1981598a4ed9ff102f1ac17360cbd6c4 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:37:55 +0000
Subject: [PATCH 6/7] cleanup
---
.../Transforms/XeGPUPeepHoleOptimizer.cpp | 15 +---
.../Transforms/XeGPUSubgroupDistribute.cpp | 38 +---------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 74 +++----------------
3 files changed, 12 insertions(+), 115 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index c43eaba5b3ee6..c488bca363da6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -28,7 +28,6 @@
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Debug.h"
#include <optional>
namespace mlir {
@@ -151,22 +150,12 @@ static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
auto laneLayoutI64 = origLayout.getEffectiveLaneLayoutAsInt();
SmallVector<int32_t> laneLayoutI32(laneLayoutI64.begin(),
laneLayoutI64.end());
- LLVM_DEBUG({
- DBGS() << "tryOptimize: origLayout=" << origLayout << "\n";
- DBGS() << " laneLayoutI32=[";
- llvm::interleaveComma(laneLayoutI32, llvm::dbgs());
- llvm::dbgs() << "], laneData=[1, 1]";
- if (origLayout.getOrder())
- llvm::dbgs() << ", order=" << origLayout.getOrder();
- llvm::dbgs() << "\n";
- DBGS() << " supportedShape=[" << supportedHeight << ", " << supportedWidth
- << "], newElemTy=" << newElemTy << ", arrayLen=" << arrayLen << "\n";
- });
+
xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
ctx, /*lane_layout=*/DenseI32ArrayAttr::get(ctx, laneLayoutI32),
/*lane_data=*/DenseI32ArrayAttr::get(ctx, {1, 1}),
/*order=*/origLayout.getOrder());
- LLVM_DEBUG(DBGS() << " newLayout=" << newLayout << "\n");
+
// Array length can not be larger than 1 for transpose case.
return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
tdescType.getBoundaryCheck(),
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 27cf788933f18..d8ce24ddd5cb0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -800,17 +800,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
- LLVM_DEBUG(DBGS() << "StoreDistribution: attempting to match\n");
Operation *lastNode = warpOp.getTerminator()->getPrevNode();
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
- if (!storeScatterOp) {
- LLVM_DEBUG(
- DBGS()
- << "StoreDistribution: last node is not StoreScatterOp, skipping\n");
+ if (!storeScatterOp)
return failure();
- }
- LLVM_DEBUG(DBGS() << "StoreDistribution: matched StoreScatterOp: "
- << *storeScatterOp << "\n");
auto offsets = storeScatterOp.getOffsets();
if (!offsets || !isa<VectorType>(offsets.getType()))
return rewriter.notifyMatchFailure(
@@ -818,15 +811,10 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
- LLVM_DEBUG(DBGS() << "StoreDistribution: offsetsTy=" << offsetsTy
- << ", maskTy=" << maskTy << ", storeVecTy=" << storeVecTy
- << "\n");
// Add handling for leading unit dimensions support
int chunkSize = storeScatterOp.getChunkSize().value_or(1);
int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
- LLVM_DEBUG(DBGS() << "StoreDistribution: chunkSize=" << chunkSize
- << ", effectiveVecRank=" << effectiveVecRank << "\n");
// Check that all leading dimensions are unit dimensions
for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
@@ -843,24 +831,6 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
auto layoutMask =
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
- LLVM_DEBUG({
- DBGS() << "StoreDistribution: layoutPayload=";
- if (layoutPayload)
- DBGS() << layoutPayload;
- else
- DBGS() << "(null)";
- DBGS() << ", layoutOffsets=";
- if (layoutOffsets)
- DBGS() << layoutOffsets;
- else
- DBGS() << "(null)";
- DBGS() << ", layoutMask=";
- if (layoutMask)
- DBGS() << layoutMask;
- else
- DBGS() << "(null)";
- DBGS() << "\n";
- });
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -879,9 +849,6 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
- LLVM_DEBUG(DBGS() << "StoreDistribution: distPayloadTy=" << distPayloadTy
- << ", distOffsetsTy=" << distOffsetsTy
- << ", distMaskTy=" << distMaskTy << "\n");
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
@@ -918,10 +885,7 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
- LLVM_DEBUG(DBGS() << "StoreDistribution: created new op: " << newOp
- << "\n");
rewriter.eraseOp(storeScatterOp);
- LLVM_DEBUG(DBGS() << "StoreDistribution: done\n");
return success();
}
};
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 55cf47e38dfd0..bcac517937754 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -23,14 +23,10 @@
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>
-#define DEBUG_TYPE "xegpu-utils"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
using namespace mlir;
/// convert ArrayRef<ValueRange> into SmallVector<Value>
@@ -149,31 +145,19 @@ std::string xegpu::getTemporaryLayoutName(const OpResult result) {
}
xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
- LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(Value): type="
- << value.getType() << "\n");
- if (!value) {
- LLVM_DEBUG(DBGS() << " -> null value, returning nullptr\n");
+ if (!value)
return nullptr;
- }
if (auto tdescTy =
- dyn_cast_if_present<xegpu::TensorDescType>(value.getType())) {
- auto layout = tdescTy.getLayoutAttr();
- LLVM_DEBUG(DBGS() << " -> TensorDescType, layout="
- << (layout ? layout : nullptr) << "\n");
- return layout;
- }
+ dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
+ return tdescTy.getLayoutAttr();
if (auto result = dyn_cast<OpResult>(value)) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");
- LLVM_DEBUG(DBGS() << " OpResult #" << result.getResultNumber() << " from "
- << defOp->getName() << "\n");
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
auto layout = anchorOp.getAnchorLayout();
- LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface, layout="
- << (layout ? layout : nullptr) << "\n");
return layout;
}
@@ -181,100 +165,60 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
if (defOp->hasAttr(layoutName)) {
auto layout =
defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
- LLVM_DEBUG(DBGS() << " -> temporary attr '" << layoutName
- << "', layout=" << layout << "\n");
return layout;
}
- LLVM_DEBUG(DBGS() << " -> OpResult: no layout found (checked '"
- << layoutName << "')\n");
}
if (auto arg = dyn_cast<BlockArgument>(value)) {
auto *parentOp = arg.getOwner()->getParentOp();
- LLVM_DEBUG(DBGS() << " BlockArgument #" << arg.getArgNumber() << " of "
- << (parentOp ? parentOp->getName().getStringRef()
- : StringRef("(null)"))
- << "\n");
if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
if (tiedInit) {
- LLVM_DEBUG(DBGS() << " -> LoopLikeOp, recursing into tiedInit "
- << "operand #" << tiedInit->getOperandNumber()
- << "\n");
return getDistributeLayoutAttr(tiedInit->get());
}
- LLVM_DEBUG(DBGS() << " -> LoopLikeOp, no tiedInit\n");
}
}
- LLVM_DEBUG(DBGS() << " -> returning nullptr\n");
return nullptr;
}
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();
- LLVM_DEBUG(DBGS() << "getDistributeLayoutAttr(OpOperand): operand #" << idx
- << " of " << op->getName()
- << ", type=" << opr.get().getType() << "\n");
if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
if (idx == 0) {
- auto layout = dpasOp.getLayoutAAttr();
- LLVM_DEBUG(DBGS() << " -> DpasOp layoutA="
- << (layout ? layout : nullptr) << "\n");
- return layout;
+ return dpasOp.getLayoutAAttr();
} else if (idx == 1) {
- auto layout = dpasOp.getLayoutBAttr();
- LLVM_DEBUG(DBGS() << " -> DpasOp layoutB="
- << (layout ? layout : nullptr) << "\n");
- return layout;
+ return dpasOp.getLayoutBAttr();
} else if (idx == 2) {
- auto layout = dpasOp.getLayoutCdAttr();
- LLVM_DEBUG(DBGS() << " -> DpasOp layoutCd="
- << (layout ? layout : nullptr) << "\n");
- return layout;
+ return dpasOp.getLayoutCdAttr();
}
}
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
- auto layout = convertOp.getInputLayoutAttr();
- LLVM_DEBUG(DBGS() << " -> ConvertLayoutOp inputLayout="
- << (layout ? layout : nullptr) << "\n");
- return layout;
+ return convertOp.getInputLayoutAttr();
}
auto layout = anchorOp.getAnchorLayout();
- if (idx == 0) {
- LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface idx=0, layout="
- << (layout ? layout : nullptr) << "\n");
+ if (idx == 0)
return layout;
- }
// For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
// the layout is valid for the first two operands: value and memref/tdesc.
// For other operations, the layout applies to the first operand only.
if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
op) &&
- (idx < 2)) {
- LLVM_DEBUG(DBGS() << " -> Store op idx=" << idx
- << ", layout=" << (layout ? layout : nullptr) << "\n");
+ (idx < 2))
return layout;
- }
- LLVM_DEBUG(DBGS() << " -> AnchorLayoutInterface idx=" << idx
- << " not covered, falling through\n");
}
std::string layoutName = xegpu::getTemporaryLayoutName(opr);
if (op->hasAttr(layoutName)) {
auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
- LLVM_DEBUG(DBGS() << " -> temporary attr '" << layoutName
- << "', layout=" << layout << "\n");
return layout;
}
- LLVM_DEBUG(DBGS() << " -> returning nullptr (checked '" << layoutName
- << "')\n");
return nullptr;
}
>From 2617a0258e5435f141292f85c5633a8574bdaebd Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Fri, 3 Apr 2026 20:41:17 +0000
Subject: [PATCH 7/7] cleanup
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index bcac517937754..f0508a30621f2 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -173,9 +173,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
auto *parentOp = arg.getOwner()->getParentOp();
if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
OpOperand *tiedInit = loop.getTiedLoopInit(arg);
- if (tiedInit) {
+ if (tiedInit)
return getDistributeLayoutAttr(tiedInit->get());
- }
}
}
More information about the Mlir-commits
mailing list