[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)

Thu Apr 24 08:57:00 PDT 2025

================
@@ -633,6 +675,743 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+  LayoutAttrAssignment(Operation *top,
+                       function_ref<LayoutInfo(Value)> getLayout)
+      : getAnalysisResult(getLayout), top(top) {}
+
+  LogicalResult run();
+
+private:
+  LogicalResult assign(Operation *op);
+  void assignToUsers(Value v, xegpu::LayoutAttr layout);
+  xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+  LogicalResult resolveConflicts();
+  function_ref<LayoutInfo(Value)>
+      getAnalysisResult; // Callable to get the layout of a value based on the
+                         // layout propagation analysis.
+  Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+  for (OpOperand &user : v.getUses()) {
+    Operation *owner = user.getOwner();
+    unsigned operandNumber = user.getOperandNumber();
+    /// Use a generic name for ease of querying the layout attribute later.
+    std::string attrName =
+        operandLayoutNamePrefix + std::to_string(operandNumber);
+    owner->setAttr(attrName, layout);
+  }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+  auto layout = getAnalysisResult(v);
+  if (!layout.isAssigned())
+    return {};
+  SmallVector<int, 2> laneLayout, laneData;
+  for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                             layout.getDataAsArrayRef())) {
+    laneLayout.push_back(static_cast<int>(layout));
+    laneData.push_back(static_cast<int>(data));
+  }
+  return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+  /// For function ops, propagate the function argument layout to the users.
+  if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+    for (auto arg : func.getArguments()) {
+      auto layoutInfo = getLayoutAttrForValue(arg);
+      if (layoutInfo) {
+        assignToUsers(arg, layoutInfo);
+      }
+    }
+    return success();
+  }
+  /// If no results, move on.
+  if (op->getNumResults() == 0)
+    return success();
+  /// If all the results are scalars, move on.
+  if (llvm::all_of(op->getResultTypes(),
+                   [](Type t) { return t.isIntOrIndexOrFloat(); }))
+    return success();
+  /// If the result is a tensor descriptor, attach the layout to the tensor
+  /// descriptor itself.
+  if (auto tensorDescTy =
+          dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+    auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+    if (!layoutInfo) {
+      LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+      return failure();
+    }
+
+    /// Clone the op, attach the layout to the result tensor descriptor, and
+    /// remove the original op.
+    OpBuilder builder(op);
+    auto *newOp = builder.clone(*op);
+    auto newTensorDescTy = xegpu::TensorDescType::get(
+        tensorDescTy.getContext(), tensorDescTy.getShape(),
+        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+    newOp->getResult(0).setType(newTensorDescTy);
+    op->replaceAllUsesWith(newOp->getResults());
+    op->erase();
+    return success();
+  }
+  /// Otherwise simply attach the layout to the op itself.
+  for (auto [i, r] : llvm::enumerate(op->getResults())) {
+    auto layoutInfo = getLayoutAttrForValue(r);
+    if (layoutInfo) {
+      auto attrName = resultLayoutNamePrefix + std::to_string(i);
+      op->setAttr(attrName, layoutInfo);
+      /// Attach the layout attribute to the users of the result.
+      assignToUsers(r, layoutInfo);
+    }
+  }
+  return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+  auto walkResult = top->walk([&](Operation *op) {
+    if (failed(assign(op)))
+      return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+
+  if (walkResult.wasInterrupted())
+    return failure();
+
+  return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+///    HW info). Otherwise can we achive this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+///    be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Helper function to get  distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If array_length >
+/// 1, that is appended to the front of the disributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                                      VectorType originalType) {
+  if (!layout)
+    return failure();
+
+  auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater or equal to the "
+         "size of the lane layout to distribute the vector type.");
+  SmallVector<int64_t> distributedShape(originalType.getShape());
+  /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+  /// dimensions are not distributed.
+  unsigned distributionStart = originalType.getRank() - laneLayout.size();
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i < distributionStart) {
+      continue;
+    }
+    /// Check if the dimension can be distributed evenly.
+    if (dim % laneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / laneLayout[i - distributionStart];
+  }
+  return VectorType::get(distributedShape, originalType.getElementType());
+}
+
+/// Get the distributed vector type for a source vector type according to a
+/// xegpu::LayoutAttr.
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
+                                           VectorType originalType) {
+  auto shape = originalType.getShape();
+  auto distVecTyOrFailure =
+      xegpu::TensorDescType::get(shape, originalType.getElementType(),
----------------
chencha3 wrote:

Sounds good to me. How do you feel about making them as static method for XeGPUDialect? I met a similar issue and did so in https://github.com/llvm/llvm-project/pull/137010. Alternatively, we can create an Utils file.

https://github.com/llvm/llvm-project/pull/135271