[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)
Charitha Saumya
llvmlistbot at llvm.org
Thu Apr 24 10:13:51 PDT 2025
================
@@ -633,6 +675,743 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+ LayoutAttrAssignment(Operation *top,
+ function_ref<LayoutInfo(Value)> getLayout)
+ : getAnalysisResult(getLayout), top(top) {}
+
+ LogicalResult run();
+
+private:
+ LogicalResult assign(Operation *op);
+ void assignToUsers(Value v, xegpu::LayoutAttr layout);
+ xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+ LogicalResult resolveConflicts();
+ function_ref<LayoutInfo(Value)>
+ getAnalysisResult; // Callable to get the layout of a value based on the
+ // layout propagation analysis.
+ Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+ for (OpOperand &user : v.getUses()) {
+ Operation *owner = user.getOwner();
+ unsigned operandNumber = user.getOperandNumber();
+ /// Use a generic name for ease of querying the layout attribute later.
+ std::string attrName =
+ operandLayoutNamePrefix + std::to_string(operandNumber);
+ owner->setAttr(attrName, layout);
+ }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+ auto layout = getAnalysisResult(v);
+ if (!layout.isAssigned())
+ return {};
+ SmallVector<int, 2> laneLayout, laneData;
+ for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+ layout.getDataAsArrayRef())) {
+ laneLayout.push_back(static_cast<int>(layout));
+ laneData.push_back(static_cast<int>(data));
+ }
+ return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+ /// For function ops, propagate the function argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (auto arg : func.getArguments()) {
+ auto layoutInfo = getLayoutAttrForValue(arg);
+ if (layoutInfo) {
+ assignToUsers(arg, layoutInfo);
+ }
+ }
+ return success();
+ }
+ /// If no results, move on.
+ if (op->getNumResults() == 0)
+ return success();
+ /// If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return success();
+ /// If the result is a tensor descriptor, attach the layout to the tensor
+ /// descriptor itself.
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+ auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+ if (!layoutInfo) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return failure();
+ }
+
+ /// Clone the op, attach the layout to the result tensor descriptor, and
+ /// remove the original op.
+ OpBuilder builder(op);
+ auto *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return success();
+ }
+ /// Otherwise simply attach the layout to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ auto layoutInfo = getLayoutAttrForValue(r);
+ if (layoutInfo) {
+ auto attrName = resultLayoutNamePrefix + std::to_string(i);
+ op->setAttr(attrName, layoutInfo);
+ /// Attach the layout attribute to the users of the result.
+ assignToUsers(r, layoutInfo);
+ }
+ }
+ return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+ auto walkResult = top->walk([&](Operation *op) {
+ if (failed(assign(op)))
+ return WalkResult::interrupt();
+ return WalkResult::advance();
+ });
+
+ if (walkResult.wasInterrupted())
+ return failure();
+
+ return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+/// HW info). Otherwise can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+/// be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Helper function to get distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If array_length >
+/// 1, that is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+ VectorType originalType) {
+ if (!layout)
+ return failure();
+
+ auto laneLayout = layout.getLaneLayout().asArrayRef();
+ assert(originalType.getShape().size() >= laneLayout.size() &&
----------------
charithaintc wrote:
Yes, only the last two dims are distributed, to satisfy the requirements of `gpu.warp_execute_on_lane0`. Obviously this is not the vector type we use at the SIMT XeGPU level, so we add a `shape_cast` to resolve the type once the op is sunk.
Example:
```
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
%1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
%2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
%3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
```
To
```
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
%1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
%2 = vector.shape_cast %1 : vector<32xf16> to vector<2x16x1xf16>
%3 = vector.extract %2[0] : vector<16x1xf16> from vector<2x16x1xf16>
%4 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
%5 = vector.shape_cast %3 : vector<16x1xf16> to vector<16xf16>
xegpu.store_nd %5, %4 : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.return
```
Here the load returns `32xf16`, but `vector.extract` expects the type to be `2x16x1` (due to how the distribution works for it), so we insert a `shape_cast` there (`%2`).
https://github.com/llvm/llvm-project/pull/135271
More information about the Mlir-commits
mailing list