[Mlir-commits] [mlir] [mlir][xegpu] Add SIMT distribution patterns for UpdateNdOffset and PrefetchNd ops. (PR #138033)
Chao Chen
llvmlistbot at llvm.org
Tue May 6 13:47:06 PDT 2025
================
@@ -1412,6 +1431,150 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
}
};
+/// Sink an update_nd_offset op feeding into yield op of an enclosing
+/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
+/// original op that will not be used by the yield op (and should be cleaned
+/// up later). The yield op will bypass the updateOp's arguments. The tensor
+/// descriptor type is not distributed. Appropriate cast ops are inserted if
+/// the distributed types do not match the expected xegpu SIMT types.
+/// Example:
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %update
+/// }
+/// ...
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32, #lo0>
+/// gpu.yield %dead, %arg0, %c32, %c16
+/// }
+/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0> -> !xegpu.tensor_desc<4x8xf32>
+/// %1 = xegpu.update_nd_offset %0, [%c32, %c16]:
+/// !xegpu.tensor_desc<4x8xf32>
+/// ...
+/// ```
+struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::UpdateNdOffset op");
+ auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+ // new update op does not have layout attribute.
+ xegpu::TensorDescType newTensorDescTy =
+ dropLayouts(updateOp.getTensorDescType());
+
+ SmallVector<Value, 3> newYieldValues;
+ SmallVector<Type, 3> newYieldTypes;
+ for (Value operand : updateOp->getOperands()) {
+ newYieldValues.push_back(operand);
+ if (isa<xegpu::TensorDescType>(operand.getType())) {
+ newYieldTypes.push_back(newTensorDescTy);
+ } else {
+ newYieldTypes.push_back(operand.getType());
+ }
+ }
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newUpdateOperands;
+ for (size_t i : newRetIndices) {
+ // For the tensor descriptor operand, the layout attribute is dropped
+ // after distribution. Types needs to be resolved in this case.
----------------
chencha3 wrote:
I see. Thanks.
https://github.com/llvm/llvm-project/pull/138033
More information about the Mlir-commits
mailing list