[Mlir-commits] [mlir] f5e2238 - [MLIR][XeGPU] Enhance multi-reduction layout propagation rules (#186308)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Mar 20 08:12:38 PDT 2026
Author: Jianhui Li
Date: 2026-03-20T08:12:32-07:00
New Revision: f5e2238a3e148f266c436a82c1bbca0a353ac5af
URL: https://github.com/llvm/llvm-project/commit/f5e2238a3e148f266c436a82c1bbca0a353ac5af
DIFF: https://github.com/llvm/llvm-project/commit/f5e2238a3e148f266c436a82c1bbca0a353ac5af.diff
LOG: [MLIR][XeGPU] Enhance multi-reduction layout propagation rules (#186308)
This PR enhances the multi-reduction layout propagation:
1. Improve the inst_data and lane_data setup to support a fractional subgroup
size (innermost source dimension smaller than the subgroup size).
2. Improve the sg_layout/sg_data setup to reuse the (nested) slice layout
from the consumer op.
It also removes the 2-D-only restriction in the load_matrix/store_matrix layout
propagation so that n-D (n > 2) layouts are allowed.
Added:
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
mlir/test/Dialect/XeGPU/propagate-layout.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index ce0cce65373e5..8561226af47f6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -193,12 +193,7 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
"getRank">,
InterfaceMethod<"Get the num of effective subgroups",
"int64_t",
- "getNumSubgroups", (ins), [{
- std::optional<SmallVector<int64_t>> sgLayout = llvm::cast<ConcreteAttr>(tablegen_opaque_val).getEffectiveSgLayoutAsInt();
- if (sgLayout.has_value())
- return computeProduct(*sgLayout);
- return 0;
- }], [{}]>,
+ "getNumSubgroups">,
InterfaceMethod<"Get the order of the layout attribute",
"DenseI32ArrayAttr",
"getOrder">,
@@ -464,6 +459,13 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
return 0;
}
+ int64_t getNumSubgroups() const {
+ auto sgLayout = getEffectiveSgLayoutAsInt();
+ if (!sgLayout.empty())
+ return computeProduct(sgLayout);
+ return 0;
+ }
+
LayoutAttr dropSgLayoutAndData() const{
// avoid every field of the attribute is nullptr, which may lead to segment fault
if (!getInstData() && !getLaneLayout())
@@ -613,6 +615,10 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
return parent.getRank() - attr.getDims().size();
}
+ int64_t getNumSubgroups() const {
+ return getParent().getNumSubgroups();
+ }
+
DenseI32ArrayAttr getOrder() const {
SliceAttr attr = flatten();
auto parent = dyn_cast<LayoutAttr>(attr.getParent());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
index 314bc78a3653f..e326fe2316702 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@@ -388,7 +388,9 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
/// layout and data with the consumer's layout on non-reduction dimensions.
/// Then, it distributes remaining subgroups across reduction dimensions. This
/// avoids subgroup data redistribution overhead between the reduced result and
-/// its consumer.
+/// its consumer. When the consumer layout is a slice layout, it attempts to
+/// reuse the slice layout's parent layout for the source to further minimize
+/// potential data redistribution.
///
/// InstData requries {1, ..., min(maxReduceVectorSize, srcShape),subgroupSize}
/// Lane Layout requires {1, ..., 1, subgroupSize}
@@ -396,17 +398,34 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
///
/// Examples:
/// 1. Subgroup layout - Row reduction on 2D tensor:
-/// srcShape=[32, 64], reductionDims=[1], resShape=[32], subgroupSize=16,
+/// srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
/// workgroupSize=32
-/// Consumer Layout:
-/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
-/// [1]>} Result: srcLayout with sgLayout=[4, 8], sgData=[8, 8] (matches
-/// consumer on non-reduction dim, minimizing data redistribution on
-/// reduction dim)
-/// 2. Subgroup layout - Same example above but consumer has different layout:
-/// sgLayout=[32], sgData=[1]
-/// Result: srcLayout with sgLayout=[32,1], sgData=[1, 64]
-/// (distributes all subgroups on non reduction dim)
+/// * Consumer Layout:
+/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
+/// [1]>}
+//// * Result Layout:
+/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8],sg_data=[8, 16]>, dims =
+/// [1]>}
+/// Note that the sg_layout is reused but sg_data needs to be adjusted to
+/// evenly distribute the source tensor tile among the reduction dim.
+///
+/// 2. Subgroup layout - Same example above but consumer doesn't have a
+/// reusable slice layout.
+/// * Consumer Layout:
+/// #xegpu.layout<sgLayout=[32], sgData=[1]>
+/// * Result Layout:
+/// #xegpu.slice<#xegpu.layout<sgLayout=[32,1], sgData=[1, 64]>, dims =
+/// [1]>}
+/// * Consumer Layout:
+/// #xegpu.slice<#xegpu.layout<sgLayout=[8, 2, 4], sgData=[4, 64, 32]>,
+/// dims = [1, 2]>}
+/// * Result Layout:
+/// #xegpu.slice<#xegpu.layout<sgLayout=[8,4], sgData=[4, 32]>, dims =
+/// [1]>}
+/// Note that the consumer's layout can't be directly reused as is.
+/// So the algorithm distributes all subgroups on non reduction dimensions
+/// first and then distribute remaining subgroups on the reduction
+/// dimension.
///
/// 2. InstData layout - Column reduction:
/// srcShape=[32, 64], reductionDims=[0], subgroupSize=16
@@ -437,75 +456,90 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
return DenseI32ArrayAttr::get(context, vec32);
};
- // Extract original plain layout for workgroup/subgroup size recovery
- xegpu::SliceAttr consumerSliceLayout =
- dyn_cast<xegpu::SliceAttr>(consumerLayout);
- DistributeLayoutAttr plainLayout =
- consumerSliceLayout ? consumerSliceLayout.flatten().getParent()
- : consumerLayout;
-
+ const int workgroupSize = consumerLayout.getNumSubgroups();
const int subgroupSize = uArch->getSubgroupSize();
int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
- xegpu::DistributeLayoutAttr srcLayout;
+ SmallVector<int64_t> consumerSgLayout =
+ consumerLayout.getEffectiveSgLayoutAsInt();
+ SmallVector<int64_t> consumerLaneLayout =
+ consumerLayout.getEffectiveLaneLayoutAsInt();
+ SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
+ DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
+ xegpu::DistributeLayoutAttr srcLayout;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
- auto sgLayoutVec = plainLayout.getEffectiveSgLayoutAsInt();
- const int workgroupSize = std::accumulate(
- sgLayoutVec.begin(), sgLayoutVec.end(), 1, std::multiplies<int64_t>());
- SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank);
- SmallVector<int64_t> consumerSgLayout =
- consumerLayout.getEffectiveSgLayoutAsInt();
- int remainingSgCount = workgroupSize;
- int consumerIdx = consumerSgLayout.size() - 1;
-
- // First pass: Match consumer's layout on non-reduction dimensions
- for (int i = srcRank - 1; i >= 0; i--) {
- if (!llvm::is_contained(reductionDims, i) && consumerIdx >= 0) {
- sgLayout[i] = consumerSgLayout[consumerIdx];
- assert((srcShape[i] % sgLayout[i] == 0) &&
- "source shape not divisible by consumer sg_layout");
- sgData[i] = srcShape[i] / sgLayout[i];
- remainingSgCount /= sgLayout[i];
- consumerIdx--;
- }
- }
+ xegpu::SliceAttr consumerSliceLayout =
+ dyn_cast<xegpu::SliceAttr>(consumerLayout);
+ if (consumerSliceLayout &&
+ consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
+ srcLayout = consumerSliceLayout.getParent();
+ SmallVector<int64_t> sgLayoutFromConsumer =
+ srcLayout.getEffectiveSgLayoutAsInt();
+ auto srcSgData = computeShapeRatio(srcShape, sgLayoutFromConsumer);
+ if (srcSgData)
+ for (int dim = 0; dim < srcRank; dim++) {
+ srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
+ }
+ } else {
- // Second pass: Distribute remaining subgroups across reduction dimensions
- for (int i = srcRank - 1; i >= 0; i--) {
- if (llvm::is_contained(reductionDims, i)) {
- sgLayout[i] =
- std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
- assert((srcShape[i] % sgLayout[i] == 0) &&
- "source shape not divisible by sg_layout");
- sgData[i] = srcShape[i] / sgLayout[i];
- remainingSgCount /= sgLayout[i];
+ SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
+ int remainingSgCount = workgroupSize;
+ int consumerIdx = 0;
+
+ // First pass: Match consumer's layout on non-reduction dimensions
+ for (int i = 0; i < srcRank; i++) {
+ if (!llvm::is_contained(reductionDims, i) &&
+ consumerIdx < static_cast<int>(consumerSgLayout.size())) {
+ sgLayout[i] = consumerSgLayout[consumerIdx];
+ assert((srcShape[i] % sgLayout[i] == 0) &&
+ "source shape not divisible by consumer sg_layout");
+ sgData[i] = srcShape[i] / sgLayout[i];
+ remainingSgCount /= sgLayout[i];
+ order[i] = consumerOrder[consumerIdx];
+ consumerIdx++;
+ }
}
- }
- assert(remainingSgCount == 1 && "not all subgroups distributed");
- srcLayout = xegpu::LayoutAttr::get(
- context, toInt32Attr(sgLayout), toInt32Attr(sgData),
- /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
- /*lane_data =*/nullptr, /*order =*/nullptr);
+ // Second pass: Distribute remaining subgroups across reduction dimensions
+ int64_t remainOrder = consumerSgLayout.size();
+ for (int i = 0; i < srcRank; i++) {
+ if (llvm::is_contained(reductionDims, i)) {
+ sgLayout[i] =
+ std::min(srcShape[i], static_cast<int64_t>(remainingSgCount));
+ assert((srcShape[i] % sgLayout[i] == 0) &&
+ "source shape not divisible by sg_layout");
+ sgData[i] = srcShape[i] / sgLayout[i];
+ remainingSgCount /= sgLayout[i];
+ order[i] = remainOrder++;
+ }
+ }
+ assert(remainingSgCount == 1 && "not all subgroups distributed");
+ srcLayout = xegpu::LayoutAttr::get(
+ context, toInt32Attr(sgLayout), toInt32Attr(sgData),
+ /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+ /*lane_data =*/nullptr, /*order =*/
+ (!orderAttr || orderAttr.empty()) ? nullptr : toInt32Attr(order));
+ }
} else if (layoutKind == xegpu::LayoutKind::InstData) {
SmallVector<int64_t> instData(srcRank, 1);
instData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
- instData[srcRank - 1] = subgroupSize;
+ instData[srcRank - 1] =
+ std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
} else if (layoutKind == xegpu::LayoutKind::Lane) {
SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
- laneLayout[srcRank - 1] = subgroupSize;
+ laneLayout[srcRank - 1] =
+ std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
laneData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
- toInt32Attr(laneData),
- consumerLayout.getOrder());
+ toInt32Attr(laneData));
}
return xegpu::SliceAttr::get(context, srcLayout,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 82e45f813d200..2d0f1283c3e00 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -1142,7 +1142,6 @@ void LayoutInfoPropagation::visitLoadMatrixOp(
if (!hasParamsOfLayoutKind(anchorLayout)) {
VectorType resVecTy =
llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
- assert(resVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
const uArch *uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
if (!uArch)
return;
@@ -1163,7 +1162,6 @@ void LayoutInfoPropagation::visitStoreMatrixOp(
} else {
VectorType srcVecTy =
llvm::cast<VectorType>(storeMatrix.getData().getType());
- assert(srcVecTy.getRank() == 2 && "Expecting 2D vector for store matrix.");
const uArch *uArch = getUArch(getChipStr(storeMatrix).value_or(""));
if (!uArch)
return;
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 503fb25deb151..5a95185c8de48 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -297,6 +297,42 @@ func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1
}
}
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size(
+// CHECK: %[[ReduceVal:.*]] = vector.multi_reduction <add>, %[[Val:.*]], %[[CST:.*]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x16x1xf16> to vector<1xf16>
+func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<16xi1>
+ %0 = vector.step : vector<16xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+ %2 = vector.shape_cast %1 : vector<16xf16> to vector<1x16x1xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+ %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x16x1xf16> to vector<1xf16>
+ %cst_2 = arith.constant dense<true> : vector<1xi1>
+ %cst_3 = arith.constant dense<1> : vector<1xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [1, 2]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+ return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4x1(
+// CHECK: %[[ReduceVal:.*]] = vector.multi_reduction <add>, %[[Val:.*]], %[[CST:.*]] {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 4]>, dims = [1, 2]>} [1, 2] : vector<1x16x4xf16> to vector<1xf16>
+func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4x1(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<64xi1>
+ %0 = vector.step : vector<64xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<64xindex>, vector<64xi1> -> vector<64xf16>
+ %2 = vector.shape_cast %1 : vector<64xf16> to vector<1x16x4xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+ %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x16x4xf16> to vector<1xf16>
+ %cst_2 = arith.constant dense<true> : vector<1xi1>
+ %cst_3 = arith.constant dense<1> : vector<1xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 <{layout = #xegpu.slice<#xegpu.layout<inst_data = [1, 1, 16]>, dims = [1, 2]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+ return
+ }
+}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index 9ee3de4490727..e4e6d61b92fda 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -128,8 +128,7 @@ gpu.module @test {
gpu.module @test {
// CHECK-LABEL: vector_row_reduction
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
- gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes
- {known_block_size = array<i32: 1, 32, 1>} {
+ gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@@ -144,8 +143,7 @@ gpu.module @test {
// -----
gpu.module @test {
// CHECK-LABEL: vector_nest_reduction
- gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes
- {known_block_size = array<i32: 1, 32, 1>} {
+ gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@@ -167,21 +165,33 @@ gpu.module @test {
// -----
gpu.module @test {
- // CHECK-LABEL: broadcast_both_leadingdims_innerdims
- gpu.func @broadcast_both_leadingdims_innerdims(%arg0: memref<32x2x192xf32>, %arg1: memref<32x2x192xf32>, %arg2: memref<32x2x192xf32>) kernel attributes {known_block_size = array<i32: 768, 1, 1>, known_grid_size = array<i32: 16, 1, 1>} {
- // CHECK: arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<2x2x6x32xi1>
- %cst = arith.constant dense<true> : vector<2x2x6x32xi1>
- // CHECK: arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<1.000000e+00> : vector<2x2x6x32xf32>
- %cst_0 = arith.constant dense<1.000000e+00> : vector<2x2x6x32xf32>
- %intptr = memref.extract_aligned_pointer_as_index %arg2 : memref<32x2x192xf32> -> index
- %0 = arith.index_cast %intptr : index to i64
- // CHECK: vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [0, 1]>, dims = [1]>} : vector<6xindex>
- %1 = vector.step : vector<6xindex>
- // CHECK: vector.shape_cast {{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 1]>, dims = [0, 1]>} : vector<6xindex> to vector<6x1xindex>
- %2 = vector.shape_cast %1 : vector<6xindex> to vector<6x1xindex>
- // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : vector<6x1xindex> to vector<2x2x6x32xindex>
- %3 = vector.broadcast %2 : vector<6x1xindex> to vector<2x2x6x32xindex>
- xegpu.store %cst_0, %0[%3], %cst <{layout = #xegpu.layout<sg_layout = [2, 2, 6, 1], sg_data = [1, 1, 1, 32]>}> : vector<2x2x6x32xf32>, i64, vector<2x2x6x32xindex>, vector<2x2x6x32xi1>
+// CHECK-LABEL: vector_nest_reduction_with_nest_slice_layout
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>, dims = [1]>} dense<0.000000e+00> : vector<32xf32>
+// CHECK: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>} dense<0.000000e+00> : vector<32x128xf32>
+// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32, #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>>
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>}>
+// CHECK-SAME: -> vector<32x128xf32>
+// CHECK: %[[BCAST1:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>} : vector<32x128xf32> to vector<4x32x128xf32>
+// CHECK: %[[REDUCE1:.*]] = vector.multi_reduction <add>, %[[BCAST1]], %[[CST0]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>} [0] : vector<4x32x128xf32> to vector<32x128xf32>
+// CHECK: %[[REDUCE2:.*]] = vector.multi_reduction <add>, %[[REDUCE1]], %[[CST]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 16]>, dims = [0]>, dims = [1]>} [1] : vector<32x128xf32> to vector<32xf32>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} dense<true> : vector<32xi1>
+// CHECK: %[[OFFSET:.*]] = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} : vector<32xindex>
+// CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
+// CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
+// CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
+ gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
+ %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
+ %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
+ %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32>
+ %bcast1 = vector.broadcast %load: vector<32x128xf32> to vector<4x32x128xf32>
+ %bcast = vector.multi_reduction <add>, %bcast1, %cst1 [0]: vector<4x32x128xf32> to vector<32x128xf32>
+ %reduce = vector.multi_reduction <add>, %bcast, %cst [1] : vector<32x128xf32> to vector<32xf32>
+ %mask = arith.constant dense<1>: vector<32xi1>
+ %offset = vector.step : vector<32xindex>
+ xegpu.store %reduce, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>} : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
gpu.return
}
}
@@ -279,13 +289,13 @@ gpu.module @test {
// -----
gpu.module @xevm_module{
// CHECK-LABEL: load_store_matrix
- gpu.func @load_store_matrix(%arg0: !xegpu.mem_desc<64x128xf32>, %sg_id_lt_2: i1) {
+ gpu.func @load_store_matrix(%arg0: !xegpu.mem_desc<1x64x128xf32>, %sg_id_lt_2: i1) {
%c0 = arith.constant 0 : index
scf.if %sg_id_lt_2 {
- // CHECK: xegpu.load_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}>
- %1 = xegpu.load_matrix %arg0[%c0, %c0] : !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
- // CHECK: xegpu.store_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}>
- xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 16]>}> : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+ // CHECK: xegpu.load_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}>
+ %1 = xegpu.load_matrix %arg0[%c0, %c0, %c0] : !xegpu.mem_desc<1x64x128xf32>, index, index, index -> vector<1x32x32xf32>
+ // CHECK: xegpu.store_matrix %{{.*}} <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}>
+ xegpu.store_matrix %1, %arg0[%c0, %c0, %c0] <{layout = #xegpu.layout<sg_layout = [1, 4, 2], sg_data = [1, 8, 16]>}> : vector<1x32x32xf32>, !xegpu.mem_desc<1x64x128xf32>, index, index, index
}
gpu.return
}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index ddd2d22108d1f..fee13b8d3e128 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -688,6 +688,89 @@ func.func @vector_shape_cast_expand_non_unit_dims(%arg0: memref<1024xf16>, %arg1
return
}
}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [1, 2]>} dense<0.000000e+00> : vector<1xf16>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x1x1xf16> to vector<1xf16>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<1> : vector<1xindex>
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<1xi1>
+ %0 = vector.step : vector<1xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+ %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+ %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x1x1xf16> to vector<1xf16>
+ %cst_2 = arith.constant dense<true> : vector<1xi1>
+ %cst_3 = arith.constant dense<1> : vector<1xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+ return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [4], lane_data = [1]>} dense<true> : vector<4xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [4], lane_data = [1]>} : vector<4xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [4], lane_data = [1]>}> : memref<1024xf16>, vector<4xindex>, vector<4xi1> -> vector<4xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>} : vector<4xf16> to vector<1x4xf16>
+// CHECK: %[[ACC:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>, dims = [1, 2]>} dense<0.000000e+00> : vector<1xf16>
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 4], lane_data = [1, 1]>, dims = [1, 2]>} [1, 2] : vector<1x4xf16> to vector<1xf16>
+// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[OFF:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<1> : vector<1xindex>
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+ %cst = arith.constant dense<true> : vector<4xi1>
+ %0 = vector.step : vector<4xindex>
+ %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<4xindex>, vector<4xi1> -> vector<4xf16>
+ %2 = vector.shape_cast %1 : vector<4xf16> to vector<1x4xf16>
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
+ %4 = vector.multi_reduction <add>, %2, %cst_0 [1, 2] : vector<1x4xf16> to vector<1xf16>
+ %cst_2 = arith.constant dense<true> : vector<1xi1>
+ %cst_3 = arith.constant dense<1> : vector<1xindex>
+ xegpu.store %4, %arg1[%cst_3], %cst_2 : vector<1xf16>, memref<16xf16>, vector<1xindex>, vector<1xi1>
+ return
+ }
+}
+
+// -----
+gpu.module @test {
+// CHECK: func.func @vector_reduction_broadcast_transpose(%[[ARG0:.*]]: memref<1024x64xf32>, %[[ARG1:.*]]: memref<1024x64xf32>)
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[CST_0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0xFF800000> : vector<1x16xf32>
+// CHECK: %[[CST_1:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>} dense<0.000000e+00> : vector<8xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[CST_0]], %[[CST]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<1x16xf32> to vector<1xf32>
+// CHECK: %[[INS:.*]] = vector.insert_strided_slice %[[RED]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>, offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
+// CHECK: %[[ADD:.*]] = arith.addf %[[INS]], %[[CST_1]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>, dims = [0]>} : vector<8xf32>
+// CHECK: %[[BC:.*]] = vector.broadcast %[[ADD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>} : vector<8xf32> to vector<16x8xf32>
+// CHECK: %[[TR:.*]] = vector.transpose %[[BC]], [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x8xf32> to vector<8x16xf32>
+// CHECK: %[[DESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x64xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[TR]], %[[DESC]][%[[C0]], %[[C0]]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+func.func @vector_reduction_broadcast_transpose(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant dense<0xFF800000> : vector<1x16xf32>
+ %cst_1 = arith.constant dense<0.000000e+00> : vector<8xf32>
+ %c0 = arith.constant 0 : index
+ %100 = vector.multi_reduction <add>, %cst_0, %cst [1] : vector<1x16xf32> to vector<1xf32>
+ %157 = vector.insert_strided_slice %100, %cst_1 {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
+ %165 = arith.addf %157, %cst_1 : vector<8xf32>
+ %166 = vector.broadcast %165 : vector<8xf32> to vector<16x8xf32>
+ %168 = vector.transpose %166, [1, 0] : vector<16x8xf32> to vector<8x16xf32>
+ %172 = xegpu.create_nd_tdesc %arg1 : memref<1024x64xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ xegpu.store_nd %168, %172[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>>
+ return
+ }
+}
+
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_shape_cast_expand_and_merge(
More information about the Mlir-commits
mailing list