[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)
Chao Chen
llvmlistbot at llvm.org
Wed Apr 23 07:37:08 PDT 2025
================
@@ -633,6 +675,743 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+ LayoutAttrAssignment(Operation *top,
+ function_ref<LayoutInfo(Value)> getLayout)
+ : getAnalysisResult(getLayout), top(top) {}
+
+ LogicalResult run();
+
+private:
+ LogicalResult assign(Operation *op);
+ void assignToUsers(Value v, xegpu::LayoutAttr layout);
+ xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+ LogicalResult resolveConflicts();
+ function_ref<LayoutInfo(Value)>
+ getAnalysisResult; // Callable to get the layout of a value based on the
+ // layout propagation analysis.
+ Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+ for (OpOperand &user : v.getUses()) {
+ Operation *owner = user.getOwner();
+ unsigned operandNumber = user.getOperandNumber();
+ /// Use a generic name for ease of querying the layout attribute later.
+ std::string attrName =
+ operandLayoutNamePrefix + std::to_string(operandNumber);
+ owner->setAttr(attrName, layout);
+ }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+ auto layout = getAnalysisResult(v);
+ if (!layout.isAssigned())
+ return {};
+  SmallVector<int, 2> laneLayout, laneData;
+  for (auto [layoutVal, dataVal] : llvm::zip_equal(
+           layout.getLayoutAsArrayRef(), layout.getDataAsArrayRef())) {
+    laneLayout.push_back(static_cast<int>(layoutVal));
+    laneData.push_back(static_cast<int>(dataVal));
+  }
+ return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+ /// For function ops, propagate the function argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (auto arg : func.getArguments()) {
+ auto layoutInfo = getLayoutAttrForValue(arg);
+ if (layoutInfo) {
+ assignToUsers(arg, layoutInfo);
+ }
+ }
+ return success();
+ }
+ /// If no results, move on.
+ if (op->getNumResults() == 0)
+ return success();
+ /// If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return success();
+ /// If the result is a tensor descriptor, attach the layout to the tensor
+ /// descriptor itself.
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+ auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+ if (!layoutInfo) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return failure();
+ }
+
+ /// Clone the op, attach the layout to the result tensor descriptor, and
+ /// remove the original op.
+ OpBuilder builder(op);
+ auto *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return success();
+ }
+ /// Otherwise simply attach the layout to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ auto layoutInfo = getLayoutAttrForValue(r);
+ if (layoutInfo) {
+ auto attrName = resultLayoutNamePrefix + std::to_string(i);
+ op->setAttr(attrName, layoutInfo);
+ /// Attach the layout attribute to the users of the result.
+ assignToUsers(r, layoutInfo);
+ }
+ }
+ return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+ auto walkResult = top->walk([&](Operation *op) {
+ if (failed(assign(op)))
+ return WalkResult::interrupt();
+ return WalkResult::advance();
+ });
+
+ if (walkResult.wasInterrupted())
+ return failure();
+
+ return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+/// HW info). Otherwise, can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+/// be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
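As a rough usage sketch (not part of this diff), the assignment could be driven from a pass along these lines; the `getLayoutInfo` accessor on the propagation analysis is a hypothetical name used only for illustration:

    static LogicalResult assignLayoutsSketch(Operation *top,
                                             RunSGMapPropagation &analysis) {
      // Adapt the analysis to the callback expected by LayoutAttrAssignment.
      auto getLayout = [&](Value v) -> LayoutInfo {
        return analysis.getLayoutInfo(v); // hypothetical accessor name
      };
      LayoutAttrAssignment layoutAssignment(top, getLayout);
      // Walks the IR, attaches xegpu::LayoutAttr to ops and their users, and
      // then attempts to resolve layout conflicts.
      return layoutAssignment.run();
    }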
+
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Helper function to get distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If array_length >
+/// 1, that is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                VectorType originalType) {
+ if (!layout)
+ return failure();
+
+ auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater than or equal "
+         "to the size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+ /// dimensions are not distributed.
+ unsigned distributionStart = originalType.getRank() - laneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart) {
+ continue;
+ }
+ /// Check if the dimension can be distributed evenly.
+ if (dim % laneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / laneLayout[i - distributionStart];
+ }
+ return VectorType::get(distributedShape, originalType.getElementType());
+}
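For instance, the first table row can be reproduced with a snippet like the one below (hypothetical, not in the patch), assuming an MLIRContext with the XeGPU dialect loaded:

    static void laneDistributionSketch(MLIRContext *ctx) {
      SmallVector<int, 2> laneLayout = {1, 16};
      SmallVector<int, 2> laneData = {1, 1};
      auto layout = xegpu::LayoutAttr::get(ctx, laneLayout, laneData);
      auto srcTy = VectorType::get({32, 16}, Float32Type::get(ctx));
      // 32x16 divided element-wise by lane_layout [1, 16] gives 32x1 per lane.
      FailureOr<VectorType> distTy =
          getDistVecTypeBasedOnLaneLayout(layout, srcTy);
      assert(succeeded(distTy) &&
             *distTy == VectorType::get({32, 1}, srcTy.getElementType()));
    }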
+
+/// Get the distributed vector type for a source vector type according to a
+/// xegpu::LayoutAttr.
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
+ VectorType originalType) {
+ auto shape = originalType.getShape();
+ auto distVecTyOrFailure =
+ xegpu::TensorDescType::get(shape, originalType.getElementType(),
+ /*array_length=*/1, /*boundary_check=*/true,
+ /*memory_space=*/xegpu::MemorySpace::Global,
+ layout)
+ .getDistributedVectorType();
+ assert(llvm::succeeded(distVecTyOrFailure) &&
+ "Failed to compute distributed vector type for the given vector type");
+ return distVecTyOrFailure.value();
+}
+
+/// Drop the layout attribute from the tensor descriptor type if layout is
+/// present.
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+ if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr())
+ return tensorDesc;
+
+ return xegpu::TensorDescType::get(
+ tensorDesc.getContext(), tensorDesc.getShape(),
+ tensorDesc.getElementType(), tensorDesc.getEncoding(),
+ xegpu::LayoutAttr());
+}
+
+/// Helper function to resolve types if the distributed type out of
+/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
+/// Example 1:
+/// distributed type: vector<8x1xf32>
+/// expected type: vector<8xf32>
+/// resolved using,
+/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
+/// Example 2:
+/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
+///   expected type: xegpu.tensor_desc<8x16xf32>
+/// resolved using,
+/// %0 = xegpu.unrealized_conversion_cast %1 :
+/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
+/// xegpu.tensor_desc<8x16xf32>
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+ PatternRewriter &rewriter) {
+ /// If orig and expected types are the same, return orig.
+ if (orig.getType() == expected)
+ return orig;
+ /// If orig is a vector type, create a shape cast op to reconcile the types.
+  if (isa<VectorType>(orig.getType())) {
+ auto castOp =
+ rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+ return castOp.getResult();
+ }
+ /// If orig is a tensor descriptor type, create an unrealized conversion cast
+ /// op to reconcile the types.
+  if (isa<xegpu::TensorDescType>(orig.getType())) {
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
+ }
+ llvm_unreachable("Unsupported type for reconciliation");
+ return orig;
+}
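As an illustrative sketch (helper name assumed, not in the patch), Example 1 above corresponds to a call like:

    static Value reconcileToSIMTTypeSketch(Value distributed,
                                           PatternRewriter &rewriter) {
      // The distributed value is vector<8x1xf32>; the SIMT-level consumer
      // expects vector<8xf32>, so a vector.shape_cast is materialized.
      auto expectedTy = VectorType::get(
          {8}, cast<VectorType>(distributed.getType()).getElementType());
      return resolveDistributedTy(distributed, expectedTy, rewriter);
    }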
+
+/// Helper function to filter out the temporary layout attributes attached
+/// during the layout assignment process. These are not needed after going to
+/// SIMT.
+static SmallVector<NamedAttribute>
+removeTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> newAttrs;
+ for (auto attr : attrs) {
+ if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+ attr.getName().strref().contains(resultLayoutNamePrefix)) {
+ continue;
+ }
+ newAttrs.push_back(attr);
+ }
+ return newAttrs;
+}
+
+/// Helper function to check if the layout is packed. Layout is packed if it is
+/// 2D and lane_data[0] != 1 (data packed from col dimension).
+static bool hasPackedLayout(xegpu::LayoutAttr layout) {
+ if (layout == xegpu::LayoutAttr())
+ return false;
+ auto laneData = layout.getLaneData();
+ if (!laneData || laneData.size() != 2)
+ return false;
+ return laneData.asArrayRef()[0] != 1;
+}
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body
+/// is contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.return %result: vector<8x16xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// %laneid = gpu.lane_id : index
+/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.yield %result: vector<8x16xf32>
+/// }
+///   gpu.return %0
+/// }
+/// ```
+struct MoveFuncBodyToWarpExecuteOnLane0
+ : public OpRewritePattern<gpu::GPUFuncOp> {
+ using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+ PatternRewriter &rewriter) const override {
+ /// If the function only contains a single void return, skip.
+ if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+ }))
+ return failure();
+    /// If the function body has already been moved inside a
+    /// warp_execute_on_lane_0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ /// Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    /// Create a WarpExecuteOnLane0Op with the same arguments and results as
+    /// the original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
+        /*upperBound=*/mlir::IntegerAttr());
+ auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+ auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ /// Replace the ReturnOp of the original gpu function with a YieldOp.
+    auto origReturnOp =
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origReturnOp);
+    rewriter.create<gpu::YieldOp>(origReturnOp.getLoc(),
+                                  origReturnOp.getOperands());
+    rewriter.eraseOp(origReturnOp);
+ /// Move the original function body to the WarpExecuteOnLane0Op body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+ return success();
+ }
+};
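A minimal sketch (not from this patch) of how this pattern could be wired up with the standard greedy driver from GreedyPatternRewriteDriver.h; the surrounding pass boilerplate and the choice of driver are assumptions for illustration:

    static LogicalResult moveBodiesToWarpOpSketch(gpu::GPUModuleOp module) {
      RewritePatternSet patterns(module.getContext());
      patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(module.getContext());
      // Wraps the body of every gpu.func in a gpu.warp_execute_on_lane_0
      // region keyed on gpu.lane_id.
      return applyPatternsGreedily(module, std::move(patterns));
    }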
+
+/// Distribute a create_nd_tdesc feeding into the gpu.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
+/// arguments. Tensor descriptor shape is not distributed because it is a
+/// uniform value across all work items within the subgroup. However, the
+/// layout information is dropped in the new tensor descriptor type.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %arg0, %dead
+/// }
+/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+
+ auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
----------------
chencha3 wrote:
What is the reason to restrict the source to MemRefType?
https://github.com/llvm/llvm-project/pull/135271
More information about the Mlir-commits mailing list