[Mlir-commits] [mlir] [MLIR][XeGPU] Extend propagation and sg_to_lane distribution pass support broadcast with low rank and scalar source input (PR #170409)
Jianhui Li
llvmlistbot at llvm.org
Tue Dec 2 18:42:58 PST 2025
https://github.com/Jianhui-Li created https://github.com/llvm/llvm-project/pull/170409
This PR extends XeGPU layout propagation and distribution for vector.broadcast operation.
It relaxes the restriction of layout propagation to allow low-rank and scalar source input, and adds a pattern in sg-to-wi distribution to support the lowering.
>From 662e38c0be162653c5f91f6b1a7d5a27e541195c Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 3 Dec 2025 00:27:14 +0000
Subject: [PATCH 1/2] broadcast 1d/scalar to 2d, propagation and sg
distribution
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 14 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 21 +++
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 26 +++-
.../Transforms/XeGPUSubgroupDistribute.cpp | 128 +++++++++++++++++-
.../Dialect/XeGPU/subgroup-distribute.mlir | 61 +++++++++
5 files changed, 244 insertions(+), 6 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 93c5187b00756..2103b169b5c00 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -283,9 +283,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
}
return true;
}]>,
- InterfaceMethod</*desc=*/[{Check if this layout is a slice of some other layout.}],
+ InterfaceMethod</*desc=*/[{Check if this layout is a slice of another layout.}],
/*retTy=*/"bool",
/*methodName=*/"isSliceOf",
+ /*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other)>,
+
+ InterfaceMethod</*desc=*/[{Check if this layout is identical to another layout.}],
+ /*retTy=*/"bool",
+ /*methodName=*/"isIdentical",
/*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other)>
];
}
@@ -501,6 +506,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
/// Check if this is slice of some other layout.
bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
+
+ /// Check if this is identical to some other layout.
+ bool isIdentical(const xegpu::DistributeLayoutAttr &other);
}];
@@ -670,7 +678,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
/// Check if this is slice of some other layout.
bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-
+
+ /// Check if this is identical to some other layout.
+ bool isIdentical(const xegpu::DistributeLayoutAttr &other);
}];
let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index fb5d1e758dbd1..efcb7f2b5e4c2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -391,6 +391,13 @@ LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
return genCoordinates(builder, loc, ids, layout, subShape, shape);
}
+bool LayoutAttr::isIdentical(const xegpu::DistributeLayoutAttr &other) {
+ if (dyn_cast<xegpu::SliceAttr>(other))
+ return false;
+
+ return *this == dyn_cast<xegpu::LayoutAttr>(other);
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_SliceAttr
//===----------------------------------------------------------------------===//
@@ -511,6 +518,20 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
[&](int64_t dim) { return thisDims.contains(dim); });
}
+bool SliceAttr::isIdentical(const xegpu::DistributeLayoutAttr &other) {
+ if (dyn_cast<xegpu::LayoutAttr>(other))
+ return false;
+
+ auto flattenedThis = flatten();
+ auto flattenedOther = dyn_cast<xegpu::SliceAttr>(other).flatten();
+
+ if ((flattenedThis.getParent() == flattenedOther.getParent()) &&
+ (flattenedThis.getDims() == flattenedOther.getDims())) {
+ return true;
+ }
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_RangeAttr
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index f2b0e71c9397f..cfa88250f9f14 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -588,9 +588,33 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
// Only consider nD -> nD broadcast.
if (sourceTy.getRank() != resultTy.getRank()) {
- broadcast.emitWarning("Expecting source and result to have same rank.");
+ // broadcast.emitWarning("Expecting source and result to have same rank.");
+
+ auto sourceDims = sourceTy.getShape();
+ auto resultDims = resultTy.getShape();
+ // adding the missing leading missing dims
+ SmallVector<int64_t> bcastDims;
+ int64_t dimDiff = resultTy.getRank() - sourceTy.getRank();
+ for (int i = 0; i < dimDiff; i++) {
+ bcastDims.push_back(i);
+ }
+
+ // for the rest dims in the resultTy, if sourceTy dim is 1, then it's
+ // broadcasted dim
+ for (size_t i = 0; i < sourceDims.size(); i++) {
+ if ((sourceDims[i] == 1) && (resultDims[i + dimDiff] != 1))
+ bcastDims.push_back(i + dimDiff);
+ }
+
+ // create a slice layout for the source
+ xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get(
+ broadcast->getContext(), cast<xegpu::LayoutAttr>(resultLayout.get()),
+ DenseI64ArrayAttr::get(broadcast->getContext(), bcastDims));
+
+ propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
return;
}
+
SetVector<int64_t> broadcastUnitDims = broadcast.computeBroadcastedUnitDims();
if (broadcastUnitDims.size() != 1) {
broadcast.emitWarning("Expecting source type to be nD vector only with "
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 0d1c5eeeff711..e06536b828385 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1424,6 +1424,128 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
}
};
+/// This pattern distributes the `vector.broadcast` operation across lanes in a
+/// warp. The pattern supports three use cases:
+///
+/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
+/// vector
+/// must have a slice layout of the result. If the distributed source and
+/// target vector types are identical, this lowers to a no-op; otherwise, it
+/// remains a broadcast but operates on distributed vectors.
+///
+/// 2) Broadcast a same-rank vector with identical layouts for source and
+/// target:
+/// The source vector must have unit dimensions, and lane_layout must be unit
+/// size for those unit dims. This always lowers to a no-op.
+///
+/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
+/// scalar to distributed result type.
+///
+/// Example 1 (lowering to a broadcast with distributed types):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+/// %0 = "some_def"() {layout_result_0 =
+/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+/// dims = [0]> } : () -> (vector<32xf32>) %2 = vector.broadcast %1
+/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+/// 1]>}: vector<32xf32> to vector<8x32xf32> gpu.yield %1 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+/// %0 = "some_def"() {layout_result_0 =
+/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+/// dims = [0]> } : () -> (vector<32xf32>) gpu.yield %0 : vector<32xf32>
+/// }
+/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
+///
+/// Example 2 (no-op):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
+/// %0 = "some_def"() {layout_result_0 =
+/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+/// dims = [1]> } : () -> (vector<8xf32>) %1 = vector.shape_cast %0
+/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+/// 1]>}: vector<8xf32> to vector<8x1xf32> %2 = vector.broadcast %1
+/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+/// 1]>}: vector<8x1xf32> to vector<8x32xf32> gpu.yield %1 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+/// %0 = "some_def"() {layout_result_0 =
+/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+/// dims = [1]> } : () -> (vector<8xf32>) %1 = vector.shape_cast %0
+/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+/// 1]>}: vector<8xf32> to vector<8x1xf32> gpu.yield %0 : vector<8x1xf32>
+/// }
+/// // The broadcast is implicit through layout transformation (no-op)
+/// %2 = vector.broadcast %r#0 : vector<8x1xf32> to vector<8x1xf32>
+/// ```
+struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *yieldOperand =
+ getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
+ if (!yieldOperand)
+ return failure();
+ auto broadcastOp =
+ cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
+ unsigned operandIdx = yieldOperand->getOperandNumber();
+
+ // Get the input layout - must be a slice layout
+ VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
+ xegpu::DistributeLayoutAttr sourceLayout =
+ xegpu::getDistributeLayoutAttr(broadcastOp.getSource());
+ if (sourceType) {
+ if (!sourceLayout || !isa<xegpu::SliceAttr>(sourceLayout))
+ return rewriter.notifyMatchFailure(
+ warpOp,
+ "Broadcast input must be scalar or have a slice layout attribute.");
+ // also the sourceLayout must be a slice of the broadcast result layout
+ xegpu::DistributeLayoutAttr resultLayout =
+ xegpu::getDistributeLayoutAttr(broadcastOp.getResult());
+ assert(resultLayout && "Broadcast result must have layout attribute.");
+ if (!sourceLayout.isSliceOf(resultLayout) ||
+ sourceLayout.isIdentical(resultLayout))
+ return rewriter.notifyMatchFailure(
+ warpOp, "Broadcast input layout must be a slice of result layout.");
+ }
+ // Get the distributed source type based on layout
+ FailureOr<VectorType> sourceDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
+ if (failed(sourceDistTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ warpOp, "Failed to distribute the source vector type.");
+
+ // Yield the source from the warp op - broadcast is a no-op
+ SmallVector<size_t> newRetIndices;
+ auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, {broadcastOp.getSource()},
+ {sourceDistTypeOrFailure.value()}, newRetIndices);
+
+ // Replace the broadcast result with the distributed source
+ Value distributedVal = newWarpOp.getResult(newRetIndices[0]);
+ Value newBroadcast = distributedVal;
+ // if sourceDistType is same as orignial warp result type, no need to
+ // re-create broadcast op
+ if (distributedVal.getType() != warpOp.getResult(operandIdx).getType()) {
+ // generate broadcast op outside warp op to have correct type
+ rewriter.setInsertionPointAfter(newWarpOp);
+ newBroadcast = vector::BroadcastOp::create(
+ rewriter, newWarpOp.getLoc(),
+ cast<VectorType>(warpOp.getResult(operandIdx).getType()),
+ distributedVal);
+ }
+
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
+ return success();
+ }
+};
+
/// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region.
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
@@ -1855,9 +1977,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
patterns.add<CreateNdDescDistribution, StoreNdDistribution,
LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
GpuBarrierDistribution, VectorMultiReductionDistribution,
- LoadDistribution, StoreDistribution, VectorTransposeDistribution,
- VectorBitcastDistribution, LoadMatrixDistribution,
- StoreMatrixDistribution,
+ VectorBroadcastDistribution, LoadDistribution, StoreDistribution,
+ VectorTransposeDistribution, VectorBitcastDistribution,
+ LoadMatrixDistribution, StoreMatrixDistribution,
MemrefExtractAlignedPointerAsIndexDistribution>(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 8fd3cca5594cb..e3b362f62b4f2 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -330,3 +330,64 @@ gpu.module @xevm_module{
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
+gpu.module @xevm_module{
+ gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+ %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
+ // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
+ %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
+ xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) {
+gpu.module @xevm_module{
+ gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+ %c0 = arith.constant 0 : index
+ %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
+ %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
+
+ %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
+ %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+ // CHECK-NOT: vector.broadcast
+ // CHECK-NOT: vector.shape_cast
+
+ %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
+ // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+
+ xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) {
+gpu.module @xevm_module{
+ gpu.func @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+ %c0 = arith.constant 0 : index
+ %9 = gpu.block_id x
+ %10 = arith.index_cast %9 : index to i16
+ %11 = arith.bitcast %10 : i16 to f16
+ // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16>
+ %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+ %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
+
>From a5e068b1378ee4c18565f8d17777894bf10e99b8 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 3 Dec 2025 02:30:11 +0000
Subject: [PATCH 2/2] add propagation tests
---
.../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 9 +--
mlir/test/Dialect/XeGPU/propagate-layout.mlir | 58 +++++++++++++++++++
2 files changed, 61 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index cfa88250f9f14..a36b2cc55a0ad 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -581,15 +581,12 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
// Only consider vector to vector broadcasts for now.
VectorType resultTy = broadcast.getResultVectorType();
VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
- if (!sourceTy) {
- broadcast.emitWarning("Expecting source type to be a vector type.");
+ // skip layout propagation for non-vector source operand.
+ if (!sourceTy)
return;
- }
- // Only consider nD -> nD broadcast.
+ // Hanlding broadcast from low-rank to high-rank (e.g., 1D to 2D) case.
if (sourceTy.getRank() != resultTy.getRank()) {
- // broadcast.emitWarning("Expecting source and result to have same rank.");
-
auto sourceDims = sourceTy.getShape();
auto resultDims = resultTy.getShape();
// adding the missing leading missing dims
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index f8b59b87a122b..48e77d867508b 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -640,3 +640,61 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
return
}
}
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[REDUCE]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
+func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.0000> : vector<16xf16>
+ %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = vector.multi_reduction <add>, %3, %cst [0] : vector<16x16xf16> to vector<16xf16>
+ %5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16>
+ xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_2d_to_2d_along_column(
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[SHAPECAST:.*]] = vector.shape_cast %[[REDUCE]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
+// CHECK-NEXT: vector.broadcast %[[SHAPECAST]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+
+func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.0000> : vector<16xf16>
+ %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
+ %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
+ %6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16>
+ xegpu.store_nd %6, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_scalar_to_vector(
+// CHECK: %[[CST:.*]] = arith.constant 0.{{.*}} : f16
+// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[CST]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+
+func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) {
+ %cst = arith.constant 0.0000 : f16
+ %6 = vector.broadcast %cst : f16 to vector<16x16xf16>
+ xegpu.store_nd %6, %arg0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ return
+}
+}
\ No newline at end of file
More information about the Mlir-commits
mailing list