[Mlir-commits] [mlir] [mlir][XeGPU] add WgToSg distribution pattern for load_matrix and store_matrix. (PR #154403)
Chao Chen
llvmlistbot at llvm.org
Thu Aug 21 07:35:25 PDT 2025
================
@@ -77,6 +76,89 @@ getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
return std::make_pair(sgShape, count);
}
+/// Generates element-wise addition ops of two arrays with automatic alignment.
+/// When the input arrays have different sizes, the shorter array is
+/// right-aligned with the longer array, and the unmatched leading elements from
+/// the longer array are preserved unchanged. This is commonly used for offset
+/// computation where higher-dimensional offsets need to be added to
+/// lower-dimensional adjustments.
+///
+/// Example:
+/// lhs = [10, 20, 30], rhs = [5, 7]
+/// Result: [10, 25, 37] (20+5, 30+7, with 10 preserved)
+static SmallVector<OpFoldResult>
+genIndexAdds(ConversionPatternRewriter &rewriter, Location loc,
+ ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs) {
+ // Pick `a` as the array that is at least as long as `b` (on a tie, a = lhs).
+ ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
+ ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
+ SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size())); // unmatched leading elements of `a` pass through as-is
+ a = a.slice(a.size() - b.size()); // keep only the tail of `a` so it pairs one-to-one with `b`
+ for (auto [l, r] : llvm::zip(a, b)) {
+ auto lval = getValueOrCreateConstantIndexOp(rewriter, loc, l);
+ auto rval = getValueOrCreateConstantIndexOp(rewriter, loc, r);
+ results.push_back(rewriter.createOrFold<index::AddOp>(loc, lval, rval));
+ }
+ return results;
+}
+
+/// Utility helper for deriving the list of offsets for each sub-TensorDesc
+/// or sub-MemDesc to be accessed by the current subgroup (sgId), based on the
+/// associated distribute layout attribute, the shape, the subgroup id, and the
+/// original offsets of the op.
+template <
+ typename OpType,
+ typename = std::enable_if_t<llvm::is_one_of<
+ OpType, xegpu::CreateNdDescOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
+ xegpu::PrefetchNdOp, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>>
+static LogicalResult
+genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
+ SmallVector<SmallVector<OpFoldResult>> &offsetsList) {
+ Location loc = op.getLoc();
+ SmallVector<OpFoldResult> origOffsets = op.getMixedOffsets();
+ // not applicable to ops without offsets operands.
+ if (origOffsets.empty())
+ return failure();
+
+ // not applicable to ops without workgroup layout attributes
+ xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
+ if (!layout || !layout.isForWorkgroup())
+ return failure();
+
+ Value sgId = rewriter.create<gpu::SubgroupIdOp>(loc, /*upper_bound=*/nullptr);
+
+ // adjust the linearId if the range specifier is present
+ int64_t startOfRange = -1, endOfRange = -1;
+ bool sgIdRangeSpecified = isSgIdRangeSpecified(op, startOfRange, endOfRange);
+ if (sgIdRangeSpecified) {
+ if (layout.getNumSubgroups() != endOfRange - startOfRange)
+ return rewriter.notifyMatchFailure(
+ op, "sg_layout size must match the sg_id_range");
+ Value startOfRangeVal =
+ rewriter.create<arith::ConstantIndexOp>(loc, startOfRange);
+ sgId = rewriter.create<index::SubOp>(loc, sgId, startOfRangeVal);
+ }
+
+ // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory
+ // descriptors to be accessed, based on the layout information.
+ ArrayRef<int64_t> wgShape = op.getDistributeShape();
+ auto maybeDescOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape);
+ if (failed(maybeDescOffsets))
+ return failure();
+
+ // Compute the final global offsets for each accessed sub-tensor
+ // or sub-memory descriptor.
+ // SmallVector<SmallVector<OpFoldResult>> offsetsList;
----------------
chencha3 wrote:
Fixed, thanks
https://github.com/llvm/llvm-project/pull/154403
More information about the Mlir-commits
mailing list