[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)
Chao Chen
llvmlistbot at llvm.org
Wed Apr 23 07:37:08 PDT 2025
================
@@ -633,6 +675,743 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+ LayoutAttrAssignment(Operation *top,
+ function_ref<LayoutInfo(Value)> getLayout)
+ : getAnalysisResult(getLayout), top(top) {}
+
+ LogicalResult run();
+
+private:
+ LogicalResult assign(Operation *op);
+ void assignToUsers(Value v, xegpu::LayoutAttr layout);
+ xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+ LogicalResult resolveConflicts();
+ function_ref<LayoutInfo(Value)>
+ getAnalysisResult; // Callable to get the layout of a value based on the
+ // layout propagation analysis.
+ Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+ for (OpOperand &user : v.getUses()) {
+ Operation *owner = user.getOwner();
+ unsigned operandNumber = user.getOperandNumber();
+ /// Use a generic name for ease of querying the layout attribute later.
+ std::string attrName =
+ operandLayoutNamePrefix + std::to_string(operandNumber);
+ owner->setAttr(attrName, layout);
+ }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+ auto layout = getAnalysisResult(v);
+ if (!layout.isAssigned())
+ return {};
+  SmallVector<int, 2> laneLayout, laneData;
+  for (auto [layoutVal, dataVal] : llvm::zip_equal(
+           layout.getLayoutAsArrayRef(), layout.getDataAsArrayRef())) {
+    laneLayout.push_back(static_cast<int>(layoutVal));
+    laneData.push_back(static_cast<int>(dataVal));
+  }
+ return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+ /// For function ops, propagate the function argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (auto arg : func.getArguments()) {
+ auto layoutInfo = getLayoutAttrForValue(arg);
+ if (layoutInfo) {
+ assignToUsers(arg, layoutInfo);
+ }
+ }
+ return success();
+ }
+ /// If no results, move on.
+ if (op->getNumResults() == 0)
+ return success();
+ /// If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return success();
+ /// If the result is a tensor descriptor, attach the layout to the tensor
+ /// descriptor itself.
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+ auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+ if (!layoutInfo) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return failure();
+ }
+
+ /// Clone the op, attach the layout to the result tensor descriptor, and
+ /// remove the original op.
+ OpBuilder builder(op);
+ auto *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return success();
+ }
+ /// Otherwise simply attach the layout to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ auto layoutInfo = getLayoutAttrForValue(r);
+ if (layoutInfo) {
+ auto attrName = resultLayoutNamePrefix + std::to_string(i);
+ op->setAttr(attrName, layoutInfo);
+ /// Attach the layout attribute to the users of the result.
+ assignToUsers(r, layoutInfo);
+ }
+ }
+ return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+ auto walkResult = top->walk([&](Operation *op) {
+ if (failed(assign(op)))
+ return WalkResult::interrupt();
+ return WalkResult::advance();
+ });
+
+ if (walkResult.wasInterrupted())
+ return failure();
+
+ return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+/// HW info). Otherwise, can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+/// be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
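As a rough usage sketch (not part of this diff), the assignment could be driven from a pass along these lines; the `getLayoutInfo` accessor on the propagation analysis is a hypothetical name used only for illustration:

    static LogicalResult assignLayoutsSketch(Operation *top,
                                             RunSGMapPropagation &analysis) {
      // Adapt the analysis to the callback expected by LayoutAttrAssignment.
      auto getLayout = [&](Value v) -> LayoutInfo {
        return analysis.getLayoutInfo(v); // hypothetical accessor name
      };
      LayoutAttrAssignment layoutAssignment(top, getLayout);
      // Walks the IR, attaches xegpu::LayoutAttr to ops and their users, and
      // then attempts to resolve layout conflicts.
      return layoutAssignment.run();
    }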
+
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Helper function to get distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If array_length >
+/// 1, that is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                VectorType originalType) {
+ if (!layout)
+ return failure();
+
+ auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater than or equal "
+         "to the size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+ /// dimensions are not distributed.
+ unsigned distributionStart = originalType.getRank() - laneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart) {
+ continue;
+ }
+ /// Check if the dimension can be distributed evenly.
+ if (dim % laneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / laneLayout[i - distributionStart];
+ }
+ return VectorType::get(distributedShape, originalType.getElementType());
+}
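For instance, the first table row can be reproduced with a snippet like the one below (hypothetical, not in the patch), assuming an MLIRContext with the XeGPU dialect loaded:

    static void laneDistributionSketch(MLIRContext *ctx) {
      SmallVector<int, 2> laneLayout = {1, 16};
      SmallVector<int, 2> laneData = {1, 1};
      auto layout = xegpu::LayoutAttr::get(ctx, laneLayout, laneData);
      auto srcTy = VectorType::get({32, 16}, Float32Type::get(ctx));
      // 32x16 divided element-wise by lane_layout [1, 16] gives 32x1 per lane.
      FailureOr<VectorType> distTy =
          getDistVecTypeBasedOnLaneLayout(layout, srcTy);
      assert(succeeded(distTy) &&
             *distTy == VectorType::get({32, 1}, srcTy.getElementType()));
    }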
+
+/// Get the distributed vector type for a source vector type according to a
+/// xegpu::LayoutAttr.
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
+ VectorType originalType) {
+ auto shape = originalType.getShape();
+ auto distVecTyOrFailure =
+ xegpu::TensorDescType::get(shape, originalType.getElementType(),
+ /*array_length=*/1, /*boundary_check=*/true,
+ /*memory_space=*/xegpu::MemorySpace::Global,
+ layout)
+ .getDistributedVectorType();
+ assert(llvm::succeeded(distVecTyOrFailure) &&
+ "Failed to compute distributed vector type for the given vector type");
+ return distVecTyOrFailure.value();
+}
+
+/// Drop the layout attribute from the tensor descriptor type if layout is
+/// present.
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+ if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr())
+ return tensorDesc;
+
+ return xegpu::TensorDescType::get(
+ tensorDesc.getContext(), tensorDesc.getShape(),
+ tensorDesc.getElementType(), tensorDesc.getEncoding(),
+ xegpu::LayoutAttr());
+}
+
+/// Helper function to resolve types if the distributed type out of
+/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
+/// Example 1:
+/// distributed type: vector<8x1xf32>
+/// expected type: vector<8xf32>
+/// resolved using,
+/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
+/// Example 2:
+/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
+///   expected type: xegpu.tensor_desc<8x16xf32>
+/// resolved using,
+/// %0 = xegpu.unrealized_conversion_cast %1 :
+/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
+/// xegpu.tensor_desc<8x16xf32>
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+ PatternRewriter &rewriter) {
+ /// If orig and expected types are the same, return orig.
+ if (orig.getType() == expected)
+ return orig;
+ /// If orig is a vector type, create a shape cast op to reconcile the types.
+  if (isa<VectorType>(orig.getType())) {
+ auto castOp =
+ rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+ return castOp.getResult();
+ }
+ /// If orig is a tensor descriptor type, create an unrealized conversion cast
+ /// op to reconcile the types.
+  if (isa<xegpu::TensorDescType>(orig.getType())) {
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
+ }
+ llvm_unreachable("Unsupported type for reconciliation");
+ return orig;
+}
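As an illustrative sketch (helper name assumed, not in the patch), Example 1 above corresponds to a call like:

    static Value reconcileToSIMTTypeSketch(Value distributed,
                                           PatternRewriter &rewriter) {
      // The distributed value is vector<8x1xf32>; the SIMT-level consumer
      // expects vector<8xf32>, so a vector.shape_cast is materialized.
      auto expectedTy = VectorType::get(
          {8}, cast<VectorType>(distributed.getType()).getElementType());
      return resolveDistributedTy(distributed, expectedTy, rewriter);
    }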
+
+/// Helper function to filter out the temporary layout attributes attached
+/// during the layout assignment process. These are not needed after going to
+/// SIMT.
+static SmallVector<NamedAttribute>
+removeTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> newAttrs;
+ for (auto attr : attrs) {
+ if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+ attr.getName().strref().contains(resultLayoutNamePrefix)) {
+ continue;
+ }
+ newAttrs.push_back(attr);
+ }
+ return newAttrs;
+}
+
+/// Helper function to check if the layout is packed. Layout is packed if it is
+/// 2D and lane_data[0] != 1 (data packed from col dimension).
+static bool hasPackedLayout(xegpu::LayoutAttr layout) {
+ if (layout == xegpu::LayoutAttr())
+ return false;
+ auto laneData = layout.getLaneData();
+ if (!laneData || laneData.size() != 2)
+ return false;
+ return laneData.asArrayRef()[0] != 1;
+}
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body
+/// is contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.return %result: vector<8x16xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// %laneid = gpu.lane_id : index
+/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.yield %result: vector<8x16xf32>
+/// }
+///   gpu.return %0
+/// }
+/// ```
+struct MoveFuncBodyToWarpExecuteOnLane0
+ : public OpRewritePattern<gpu::GPUFuncOp> {
+ using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+ PatternRewriter &rewriter) const override {
+ /// If the function only contains a single void return, skip.
+ if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+ }))
+ return failure();
+    /// If the function body has already been moved inside a
+    /// warp_execute_on_lane_0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ /// Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    /// Create a WarpExecuteOnLane0Op with the same arguments and results as
+    /// the original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
+        /*upperBound=*/mlir::IntegerAttr());
+ auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+ auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ /// Replace the ReturnOp of the original gpu function with a YieldOp.
+    auto origReturnOp =
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origReturnOp);
+    rewriter.create<gpu::YieldOp>(origReturnOp.getLoc(),
+                                  origReturnOp.getOperands());
+    rewriter.eraseOp(origReturnOp);
+ /// Move the original function body to the WarpExecuteOnLane0Op body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+ return success();
+ }
+};
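A minimal sketch (not from this patch) of how this pattern could be wired up with the standard greedy driver from GreedyPatternRewriteDriver.h; the surrounding pass boilerplate and the choice of driver are assumptions for illustration:

    static LogicalResult moveBodiesToWarpOpSketch(gpu::GPUModuleOp module) {
      RewritePatternSet patterns(module.getContext());
      patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(module.getContext());
      // Wraps the body of every gpu.func in a gpu.warp_execute_on_lane_0
      // region keyed on gpu.lane_id.
      return applyPatternsGreedily(module, std::move(patterns));
    }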
+
+/// Distribute a create_nd_tdesc feeding into the gpu.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
+/// arguments. Tensor descriptor shape is not distributed because it is a
+/// uniform value across all work items within the subgroup. However, the
+/// layout information is dropped in the new tensor descriptor type.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %arg0, %dead
+/// }
+/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+
+ auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
----------------
chencha3 wrote:
What is the reason to restrict the source to MemRefType?
https://github.com/llvm/llvm-project/pull/135271
More information about the Mlir-commits mailing list