[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)

Charitha Saumya llvmlistbot at llvm.org
Thu Apr 24 10:13:51 PDT 2025


================
@@ -633,6 +675,743 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+  LayoutAttrAssignment(Operation *top,
+                       function_ref<LayoutInfo(Value)> getLayout)
+      : getAnalysisResult(getLayout), top(top) {}
+
+  LogicalResult run();
+
+private:
+  LogicalResult assign(Operation *op);
+  void assignToUsers(Value v, xegpu::LayoutAttr layout);
+  xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+  LogicalResult resolveConflicts();
+  function_ref<LayoutInfo(Value)>
+      getAnalysisResult; // Callable to get the layout of a value based on the
+                         // layout propagation analysis.
+  Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+  for (OpOperand &user : v.getUses()) {
+    Operation *owner = user.getOwner();
+    unsigned operandNumber = user.getOperandNumber();
+    /// Use a generic name for ease of querying the layout attribute later.
+    std::string attrName =
+        operandLayoutNamePrefix + std::to_string(operandNumber);
+    owner->setAttr(attrName, layout);
+  }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+  auto layout = getAnalysisResult(v);
+  if (!layout.isAssigned())
+    return {};
+  SmallVector<int, 2> laneLayout, laneData;
+  for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                             layout.getDataAsArrayRef())) {
+    laneLayout.push_back(static_cast<int>(layout));
+    laneData.push_back(static_cast<int>(data));
+  }
+  return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+  /// For function ops, propagate the function argument layout to the users.
+  if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+    for (auto arg : func.getArguments()) {
+      auto layoutInfo = getLayoutAttrForValue(arg);
+      if (layoutInfo) {
+        assignToUsers(arg, layoutInfo);
+      }
+    }
+    return success();
+  }
+  /// If no results, move on.
+  if (op->getNumResults() == 0)
+    return success();
+  /// If all the results are scalars, move on.
+  if (llvm::all_of(op->getResultTypes(),
+                   [](Type t) { return t.isIntOrIndexOrFloat(); }))
+    return success();
+  /// If the result is a tensor descriptor, attach the layout to the tensor
+  /// descriptor itself.
+  if (auto tensorDescTy =
+          dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+    auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+    if (!layoutInfo) {
+      LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+      return failure();
+    }
+
+    /// Clone the op, attach the layout to the result tensor descriptor, and
+    /// remove the original op.
+    OpBuilder builder(op);
+    auto *newOp = builder.clone(*op);
+    auto newTensorDescTy = xegpu::TensorDescType::get(
+        tensorDescTy.getContext(), tensorDescTy.getShape(),
+        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+    newOp->getResult(0).setType(newTensorDescTy);
+    op->replaceAllUsesWith(newOp->getResults());
+    op->erase();
+    return success();
+  }
+  /// Otherwise simply attach the layout to the op itself.
+  for (auto [i, r] : llvm::enumerate(op->getResults())) {
+    auto layoutInfo = getLayoutAttrForValue(r);
+    if (layoutInfo) {
+      auto attrName = resultLayoutNamePrefix + std::to_string(i);
+      op->setAttr(attrName, layoutInfo);
+      /// Attach the layout attribute to the users of the result.
+      assignToUsers(r, layoutInfo);
+    }
+  }
+  return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+  auto walkResult = top->walk([&](Operation *op) {
+    if (failed(assign(op)))
+      return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+
+  if (walkResult.wasInterrupted())
+    return failure();
+
+  return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+///    HW info). Otherwise, can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+///    be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Helper function to get the distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of the tensor
+/// descriptor shape by the corresponding lane_layout dimension. If array_length
+/// > 1, it is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane_0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                                      VectorType originalType) {
+  if (!layout)
+    return failure();
+
+  auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
----------------
charithaintc wrote:

Yes, only the last two dims are distributed, to satisfy the requirements of `gpu.warp_execute_on_lane_0`. Obviously this is not the vector type we use at the SIMT XeGPU level, so we add some `shape_cast`s to resolve the type once the op is sunk.

Example:
```
  %c0 = arith.constant 0 : index
  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
  %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
  %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
  xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
```
To
```
      %c0 = arith.constant 0 : index
      %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
      %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
      %2 = vector.shape_cast %1 : vector<32xf16> to vector<2x16x1xf16>
      %3 = vector.extract %2[0] : vector<16x1xf16> from vector<2x16x1xf16>
      %4 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %5 = vector.shape_cast %3 : vector<16x1xf16> to vector<16xf16>
      xegpu.store_nd %5, %4  : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
      gpu.return
```

Here the load returns `vector<32xf16>`, but `vector.extract` expects the type to be `vector<2x16x1xf16>` (due to how distribution works for it), so we insert a `shape_cast` there (`%2`).
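
For completeness, here is a minimal sketch (illustrative only, not code from this PR) of the shape arithmetic described in the `getDistVecTypeBasedOnLaneLayout` comment above: the trailing dims are divided by the corresponding `lane_layout` entries, and a leading `array_length` dim, if present, is carried through unchanged. The helper name and signature below are assumptions made for the example:

```cpp
// Illustrative only: computes the per-lane shape for the examples above.
// E.g. {32, 16} with lane_layout {1, 16} -> {32, 1};
//      {2, 32, 16} with lane_layout {1, 16} -> {2, 32, 1}.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> distributeShape(std::vector<int64_t> shape,
                                     const std::vector<int64_t> &laneLayout) {
  assert(shape.size() >= laneLayout.size() &&
         "expected at least as many dims as lane_layout entries");
  // Only the trailing dims (those covered by lane_layout) are distributed;
  // a leading array_length dim, if present, is left untouched.
  size_t offset = shape.size() - laneLayout.size();
  for (size_t i = 0; i < laneLayout.size(); ++i) {
    assert(shape[offset + i] % laneLayout[i] == 0 &&
           "dim must be divisible by the lane_layout entry");
    shape[offset + i] /= laneLayout[i];
  }
  return shape;
}
```

Flattening the resulting `2x16x1` gives the 32 elements per lane that the distributed `load_nd` above produces as `vector<32xf16>`; the inserted `vector.shape_cast` ops reconcile that flat SIMT form with the shape the distributed `vector.extract` expects.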

https://github.com/llvm/llvm-project/pull/135271

