[Mlir-commits] [mlir] 9a2d3ab - [MLIR][XeGPU] Add support for cross-subgroup reduction from wg to sg (#170936)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Jan 16 07:19:28 PST 2026
Author: Nishant Patel
Date: 2026-01-16T07:19:23-08:00
New Revision: 9a2d3ab6adc41d47f96651c07cf1ce543ad1b063
URL: https://github.com/llvm/llvm-project/commit/9a2d3ab6adc41d47f96651c07cf1ce543ad1b063
DIFF: https://github.com/llvm/llvm-project/commit/9a2d3ab6adc41d47f96651c07cf1ce543ad1b063.diff
LOG: [MLIR][XeGPU] Add support for cross-subgroup reduction from wg to sg (#170936)
This PR adds support for cross-sg reduction whilst distributing from
workgroup to subgroup. It has following limitation
1. Cannot reduce to a scalar
2. For cross-sg, only 1:1 decomposition (each sg should be assigned only
one tile in the original WG tile) is supported for now. For example for
a WG tile of size 256x128, sg_layout = [8, 4], sg_data = [16, 16] wont
be supported.
Added:
Modified:
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
mlir/test/Dialect/XeGPU/invalid.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 53ca17f4f99bc..ee3a3207b271a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -213,13 +213,9 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout,
DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) {
- // A valid layout must include at least one of sg_layout and lane_layout.
- // sg_layout is essential for Workgroup layout, while lane_layout is
- // required for Subgroup layout.
- if (!sg_layout && !inst_data && !lane_layout) {
- return emitError()
- << "expected at least one of sg_layout, inst_data or lane_layout";
- }
+ // Special case for store_matrix
+ if (!sg_layout && !inst_data && !lane_layout)
+ return success();
// generate code to check sg_laout, inst_data and lane_layout having the same
// rank if they are not null.
@@ -478,15 +474,14 @@ DistributeLayoutAttr LayoutAttr::setUnitDimLayout(SetVector<int64_t> unitDims) {
LogicalResult
SliceAttr::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
xegpu::DistributeLayoutAttr parent, DenseI64ArrayAttr dims) {
- if (!parent || !dims)
- return emitError() << "expected parent layout and dims attribute";
- int64_t rank = parent.getRank();
+ if (!dims)
+ return emitError() << "expected dims attribute";
// check every element in dims is unique and smaller than rank
llvm::SmallDenseSet<int64_t> seen;
for (int64_t dim : dims.asArrayRef()) {
- if (dim < 0 || dim >= rank)
+ if (dim < 0)
return emitError() << "invalid dim (" << dim << ") in slice attribute.";
if (!seen.insert(dim).second)
return emitError() << "repeated dim (" << dim << ") in slice attribute.";
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index db31d93da663c..8328c2797be4f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1177,11 +1177,164 @@ struct WgToSgVectorShapeCastOp
}
};
-/// Pattern for lowering vector.multi_reduction op to subgroup level.
-/// Current limitation: the sg_layout in the reduced dimension being 1
-/// so that reduction is local to subgroup & no cross-subgroup communication is
-/// needed.
-/// TODO: Add cases to handle more general situations which require SLM access.
+static Value createAccumulator(ConversionPatternRewriter &rewriter,
+ Location loc, VectorType type,
+ vector::CombiningKind kind) {
+ Type elemTy = type.getElementType();
+
+ switch (kind) {
+ case vector::CombiningKind::ADD:
+ case vector::CombiningKind::XOR:
+ case vector::CombiningKind::OR:
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type, rewriter.getZeroAttr(elemTy)));
+
+ case vector::CombiningKind::MUL:
+ case vector::CombiningKind::AND:
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type, rewriter.getOneAttr(elemTy)));
+
+ case vector::CombiningKind::MINSI:
+ // Use max signed int value for signed integer min
+ if (auto intTy = dyn_cast<IntegerType>(elemTy)) {
+ auto maxVal = APInt::getSignedMaxValue(intTy.getWidth());
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type,
+ rewriter.getIntegerAttr(elemTy, maxVal)));
+ }
+ return nullptr;
+
+ case vector::CombiningKind::MINUI:
+ if (auto intTy = dyn_cast<IntegerType>(elemTy)) {
+ auto maxVal = APInt::getMaxValue(intTy.getWidth());
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type,
+ rewriter.getIntegerAttr(elemTy, maxVal)));
+ }
+ return nullptr;
+
+ case vector::CombiningKind::MAXSI:
+ if (auto intTy = dyn_cast<IntegerType>(elemTy)) {
+ auto minVal = APInt::getSignedMinValue(intTy.getWidth());
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type,
+ rewriter.getIntegerAttr(elemTy, minVal)));
+ }
+ return nullptr;
+
+ case vector::CombiningKind::MAXUI:
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type, rewriter.getZeroAttr(elemTy)));
+
+ case vector::CombiningKind::MINNUMF:
+ case vector::CombiningKind::MINIMUMF:
+ // Use +infinity for float min operations
+ if (auto floatTy = dyn_cast<FloatType>(elemTy)) {
+ auto posInf = APFloat::getInf(floatTy.getFloatSemantics());
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type, rewriter.getFloatAttr(elemTy, posInf)));
+ }
+ return nullptr;
+
+ case vector::CombiningKind::MAXNUMF:
+ case vector::CombiningKind::MAXIMUMF:
+ // Use -infinity for float max operations
+ if (auto floatTy = dyn_cast<FloatType>(elemTy)) {
+ auto negInf = APFloat::getInf(floatTy.getFloatSemantics(), true);
+ return arith::ConstantOp::create(
+ rewriter, loc, type,
+ DenseElementsAttr::get(type, rewriter.getFloatAttr(elemTy, negInf)));
+ }
+ return nullptr;
+ }
+ return nullptr;
+}
+
+/// This function converts multi-dimensional subgroup indices into a single
+/// linear offset. It's used to calculate memory offsets in SLM for
+/// cross-subgroup reduction coordination.
+///
+/// Parameters:
+/// - sgIds: Multi-dimensional subgroup indices (e.g., [sgId_x, sgId_y, sgId_z])
+/// - dims: Which dimensions to include in linearization (e.g., [0, 2] for x and
+/// z dims)
+/// - sgLayout: Subgroup layout sizes for each dimension (e.g., [4, 8, 2] means
+/// 4x8x2 subgroups)
+///
+/// It uses row-major linearization formula:
+/// offset = sum(sgIds[dim] * stride[dim])
+/// where stride[dim] = product of all sgLayout sizes in dimensions after
+/// 'dim'
+///
+/// Example:
+/// - sgLayout = [4, 8, 2], dims = [0, 2] (linearize x and z dimensions)
+/// - sgIds = [1, 3, 1] (subgroup at position x=1, y=3, z=1)
+/// - Calculation:
+/// * dim=0: stride=1, term = sgIds[0] * 1 = 1 * 1 = 1
+/// * dim=2: stride=sgLayout[0]=4, term = sgIds[2] * 4 = 1 * 4 = 4
+/// * linearizedOffset = 1 + 4 = 5
+///
+/// This gives us a unique linear index for each combination of subgroup
+/// positions in the specified dimensions, which is used for SLM row/column
+/// addressing.
+static Value linearizeSubgroupIndices(ConversionPatternRewriter &rewriter,
+ Location loc, ArrayRef<Value> sgIds,
+ ArrayRef<int64_t> dims,
+ ArrayRef<int64_t> sgLayout) {
+ Value linearizedOffset = arith::ConstantIndexOp::create(rewriter, loc, 0);
+ int64_t stride = 1;
+
+ for (int64_t dim : dims) {
+ Value dimVal = sgIds[dim];
+ Value strideVal = arith::ConstantIndexOp::create(rewriter, loc, stride);
+ Value term = arith::MulIOp::create(rewriter, loc, dimVal, strideVal);
+ linearizedOffset =
+ arith::AddIOp::create(rewriter, loc, linearizedOffset, term);
+ stride *= sgLayout[dim];
+ }
+
+ return linearizedOffset;
+}
+
+/// This pattern transforms vector.multi_dim_reduction operations from
+/// workgroup-level to subgroup-level execution with support for multiple
+/// reduction dimensions.
+///
+/// Steps include:
+/// 1. LOCAL REDUCTION :
+/// - Each subgroup performs local reduction on its data slice
+/// - Uses ZERO accumulator to avoid double-counting during cross-subgroup
+/// phase
+///
+/// 2. CROSS-SUBGROUP :
+/// - Determines if cross-subgroup reduction is needed (when sg_layout > 1 in
+/// reduction dims & sgData[reduction dims] < wgData[reduction dims])
+/// - If not needed, adds original accumulator and returns local results
+///
+/// 3. SHARED LOCAL MEMORY (SLM) PHASE (when cross-subgroup reduction needed):
+/// a) SLM Layout Design:
+/// - Rows: subgroups participating in reduction (product of sg_layout in
+/// reduction dims)
+/// - Cols: total result elements across non-reduction dimensions
+///
+/// b) Store Phase:
+/// - Each subgroup stores its local reduction result to SLM
+/// - Row offset: linearized index of subgroup in reduction dimensions
+/// - Col offset: linearized index of subgroup in non-reduction dimensions
+///
+/// c) Load and Final Reduction Phase:
+/// - Each subgroup loads a column of data (all reduction participants for
+/// its position)
+/// - Performs final reduction along the loaded dimension
+/// - Adds original accumulator to get final result
+///
struct WgToSgMultiDimReductionOp
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
@@ -1189,12 +1342,14 @@ struct WgToSgMultiDimReductionOp
LogicalResult
matchAndRewrite(vector::MultiDimReductionOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
VectorType srcType = op.getSourceVectorType();
VectorType dstType = dyn_cast<VectorType>(op.getResult().getType());
if (!dstType)
return failure();
- auto srcShape = srcType.getShape();
+ auto originalSrcShape = srcType.getShape();
xegpu::DistributeLayoutAttr layout =
xegpu::getTemporaryLayout(dyn_cast<OpResult>(op.getResult()));
if (!layout || !layout.isForWorkgroup())
@@ -1202,39 +1357,203 @@ struct WgToSgMultiDimReductionOp
auto reductionDims = llvm::to_vector(op.getReductionDims());
- SmallVector<int64_t> sgLayout = llvm::cast<xegpu::SliceAttr>(layout)
- .getParent()
- .getEffectiveSgLayoutAsInt();
- SmallVector<int64_t> sgData = llvm::cast<xegpu::SliceAttr>(layout)
- .getParent()
- .getEffectiveSgDataAsInt();
-
- // Check that the sgLayout in the reduced dimension is 1 and
- // each sg gets the entire slice to reduce.
- for (int64_t dim : reductionDims) {
- if (sgLayout[dim] != 1 || sgData[dim] != srcShape[dim])
- return rewriter.notifyMatchFailure(
- op,
- "sgLayout in each reduced dimension must be 1 and sgData in the "
- "reduced dim must match srcShape in that dim");
+ // Get sg_layout and sg_data from the parent layout
+ SmallVector<int64_t> sgLayout;
+ SmallVector<int64_t> sgData;
+ if (auto sliceAttr = dyn_cast<xegpu::SliceAttr>(layout)) {
+ sgLayout = sliceAttr.getParent().getEffectiveSgLayoutAsInt();
+ sgData = sliceAttr.getParent().getEffectiveSgDataAsInt();
+ } else
+ return rewriter.notifyMatchFailure(
+ op, "Reduction should have SliceAttr layout");
+
+ Type elemTy = dstType.getElementType();
+
+ // Step 1: perform local subgroup reductions with ZERO accumulator
+ SmallVector<Value> localReductions;
+ SmallVector<int64_t> sgShape =
+ getSgShapeAndCount(originalSrcShape, layout).first;
+ VectorType newDstType = VectorType::get(sgShape, elemTy);
+ for (auto sgSrc : adaptor.getSource()) {
+ // Create ZERO accumulator for local reduction
+ auto neutralLocalAcc =
+ createAccumulator(rewriter, loc, newDstType, op.getKind());
+ // Local reduction with ZERO accumulator
+ auto localReduce = vector::MultiDimReductionOp::create(
+ rewriter, loc, newDstType, op.getKind(), sgSrc, neutralLocalAcc,
+ reductionDims);
+ localReductions.push_back(localReduce.getResult());
}
- SmallVector<int64_t> sgShape = getSgShapeAndCount(srcShape, layout).first;
+ // Check if cross-subgroup reduction is needed for any reduction dimension
+ SmallVector<int64_t> crossSgReductionDims;
+ for (int64_t reductionDim : reductionDims) {
+ bool needsCrossSubgroupReduction =
+ (sgLayout[reductionDim] > 1) &&
+ (sgData[reductionDim] < originalSrcShape[reductionDim]);
- VectorType newDstType =
- VectorType::get({sgShape}, dstType.getElementType());
+ if (needsCrossSubgroupReduction) {
+ crossSgReductionDims.push_back(reductionDim);
+ }
+ }
- SmallVector<Value> newReductions;
- for (auto sgSrc : adaptor.getSource()) {
- auto newOp = vector::MultiDimReductionOp::create(
- rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc,
- adaptor.getAcc()[0], op.getReductionDims());
- xegpu::setTemporaryLayout(newOp->getResult(0),
- layout.dropSgLayoutAndData());
- newReductions.push_back(newOp.getResult());
+ // If no cross-subgroup reduction needed, add accumulator and return
+ if (crossSgReductionDims.empty()) {
+ SmallVector<Value> results;
+ for (auto localResult : localReductions) {
+ auto finalResult = vector::makeArithReduction(
+ rewriter, loc, op.getKind(), localResult, adaptor.getAcc()[0]);
+ if (auto defOp = finalResult.getDefiningOp())
+ xegpu::setDistributeLayoutAttr(defOp->getResult(0),
+ layout.dropSgLayoutAndData());
+ results.push_back(finalResult);
+ }
+ rewriter.replaceOpWithMultiple(op, {results});
+ return success();
+ }
+
+ // Step 2: cross-subgroup reduction using SLM
+
+ // Calculate total elements in local result
+ int64_t localElements = computeProduct(sgShape);
+
+ // Shape cast for SLM storage - store as [1, localElements]
+ SmallVector<int64_t> storeShape2D = {1, localElements};
+ VectorType storeType2D = VectorType::get(storeShape2D, elemTy);
+ auto storeShapeCast = vector::ShapeCastOp::create(
+ rewriter, loc, storeType2D, localReductions[0]);
+ Value storeData = storeShapeCast.getResult();
+
+ // Calculate SLM shape - rows for sg's in reduction dims, cols for total
+ // result elements across all subgroups in non-reduction dimensions
+ int64_t totalReductionSubgroups = 1;
+ for (int64_t dim : crossSgReductionDims) {
+ totalReductionSubgroups *= sgLayout[dim];
}
- rewriter.replaceOpWithMultiple(op, {newReductions});
+ // Total result elements across all subgroups in non-reduction dimensions
+ int64_t totalResultElements =
+ localElements * computeProduct(sgLayout) / totalReductionSubgroups;
+
+ SmallVector<int64_t> slmShape2D = {totalReductionSubgroups,
+ totalResultElements};
+
+ // Allocate SLM
+ auto bitWidth = elemTy.getIntOrFloatBitWidth();
+ auto bytesPerElement = bitWidth / 8;
+ int64_t slmElements = slmShape2D[0] * slmShape2D[1];
+ auto slmSize = slmElements * bytesPerElement;
+ auto slmTy = MemRefType::get({slmSize}, rewriter.getI8Type(), {}, 3);
+ auto slm = memref::AllocaOp::create(rewriter, loc, slmTy);
+
+ auto memDescType = xegpu::MemDescType::get(rewriter.getContext(),
+ slmShape2D, elemTy, nullptr);
+ auto memDesc =
+ xegpu::CreateMemDescOp::create(rewriter, loc, memDescType, slm);
+
+ // Step 4: Store local results to SLM
+ auto sgId = gpu::SubgroupIdOp::create(rewriter, loc,
+ rewriter.getIndexType(), nullptr);
+
+ // Convert sgLayout to Values for delinearizeIndex
+ SmallVector<Value> sgLayoutValues;
+ for (int64_t dim : sgLayout)
+ sgLayoutValues.push_back(
+ arith::ConstantIndexOp::create(rewriter, loc, dim));
+
+ auto sgIdsResult = affine::delinearizeIndex(rewriter, loc, sgId.getResult(),
+ sgLayoutValues);
+ if (failed(sgIdsResult))
+ return failure();
+ SmallVector<Value> sgIds = *sgIdsResult;
+
+ // Row offset: linearize reduction dimension indices
+ Value rowOffsetStore = linearizeSubgroupIndices(
+ rewriter, loc, sgIds, crossSgReductionDims, sgLayout);
+
+ // Column offset: linearize non-reduction dimension indices
+ SmallVector<int64_t> nonReductionDims;
+ for (size_t i = 0; i < sgLayout.size(); ++i) {
+ if (!llvm::is_contained(reductionDims, static_cast<int64_t>(i))) {
+ nonReductionDims.push_back(static_cast<int64_t>(i));
+ }
+ }
+
+ Value colOffset = linearizeSubgroupIndices(rewriter, loc, sgIds,
+ nonReductionDims, sgLayout);
+
+ Value localElementsVal =
+ arith::ConstantIndexOp::create(rewriter, loc, localElements);
+ colOffset =
+ arith::MulIOp::create(rewriter, loc, colOffset, localElementsVal);
+
+ SmallVector<OpFoldResult> storeOffsets2D = {rowOffsetStore, colOffset};
+
+ auto storeMatrixLayout = xegpu::SliceAttr::get(
+ rewriter.getContext(),
+ xegpu::LayoutAttr::get(rewriter.getContext(), /*sg_layout =*/nullptr,
+ /*sg_data =*/nullptr,
+ /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
+ /*lane_data =*/nullptr, /*order =*/nullptr),
+ dyn_cast<xegpu::SliceAttr>(layout).getDims());
+ xegpu::StoreMatrixOp::create(rewriter, loc, storeData, memDesc.getResult(),
+ storeOffsets2D, /*layout=*/storeMatrixLayout);
+
+ gpu::BarrierOp::create(rewriter, loc);
+
+ // Step 5: Load from SLM for final reduction
+ SmallVector<int64_t> loadShape2D = {totalReductionSubgroups, localElements};
+ VectorType loadType2D = VectorType::get(loadShape2D, elemTy);
+
+ // Load offsets - each subgroup loads its column based on non-reduction
+ // position
+ Value rowOffsetLoad = arith::ConstantIndexOp::create(rewriter, loc, 0);
+
+ SmallVector<OpFoldResult> loadOffsets2D = {rowOffsetLoad, colOffset};
+
+ auto loadOp = xegpu::LoadMatrixOp::create(
+ rewriter, loc, loadType2D, memDesc.getResult(), loadOffsets2D,
+ /*layout=*/nullptr);
+
+ // Step 6: Perform final reduction with ZERO accumulator
+ SmallVector<int64_t> finalReductionDims = {0};
+ SmallVector<int64_t> finalResultShape = {localElements};
+ VectorType finalResultType = VectorType::get(finalResultShape, elemTy);
+
+ auto neutralFinalAcc =
+ createAccumulator(rewriter, loc, finalResultType, op.getKind());
+
+ auto finalReduce = vector::MultiDimReductionOp::create(
+ rewriter, loc, finalResultType, op.getKind(), loadOp.getResult(),
+ neutralFinalAcc, finalReductionDims);
+
+ // Step 7: Add the original accumulator at the end
+ Value originalAcc = adaptor.getAcc()[0];
+ Value accToAdd = originalAcc;
+
+ // Handle shape mismatch by shape casting
+ if (originalAcc.getType() != finalReduce.getResult().getType()) {
+ auto originalAccType = cast<VectorType>(originalAcc.getType());
+ auto finalResultType =
+ cast<VectorType>(finalReduce.getResult().getType());
+
+ // If they have the same number of elements, just shape cast
+ if (originalAccType.getNumElements() ==
+ finalResultType.getNumElements()) {
+ auto shapeCast = vector::ShapeCastOp::create(
+ rewriter, loc, finalResultType, originalAcc);
+ accToAdd = shapeCast.getResult();
+ }
+ }
+
+ auto finalResult = vector::makeArithReduction(
+ rewriter, loc, op.getKind(), finalReduce.getResult(), accToAdd);
+
+ if (auto defOp = finalResult.getDefiningOp())
+ xegpu::setDistributeLayoutAttr(defOp->getResult(0),
+ layout.dropSgLayoutAndData());
+
+ rewriter.replaceOp(op, finalResult);
return success();
}
};
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 67faa60f2835e..1d4953a5134d9 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -695,16 +695,6 @@ func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
gpu.return
}
-// -----
-func.func @tensor_desc_invalid_layout_attr(%src: ui64, %offsets: vector<16xindex>) {
- %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
- !xegpu.tensor_desc<16x2xf32,
- #xegpu.scatter_tdesc_attr<chunk_size = 2>,
- // expected-error at +1 {{expected at least one of sg_layout, inst_data or lane_layout}}
- #xegpu.layout<sg_data = [16, 2], lane_data = [1, 2]>>
- return
-}
-
// -----
func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
%1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
@@ -824,15 +814,6 @@ func.func @slice_attr_repeat_dim() {
return
}
-// -----
-#l = #xegpu.layout<sg_layout = [16, 1, 1], sg_data = [1, 8, 2]>
-// expected-error at +1 {{invalid dim (3) in slice attribute}}
-#s = #xegpu.slice<#l, dims = [3]>
-func.func @slice_attr_repeat_dim() {
- %offsets = arith.constant {layout_result_0 = #s} dense<0.8> : vector<16x8xindex>
- return
-}
-
// -----
func.func @create_mem_desc_non_slm() {
%m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index ad346307437e4..bc8ba0d155716 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -85,8 +85,10 @@ gpu.module @test_distribution {
%load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>}
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
- // CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32>
+ // CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[C0:.*]] [1] : vector<16x64xf32> to vector<16xf32>
// CHECK-NOT: vector.multi_reduction
+ // CHECK-COUNT-2: arith.addf {{.*}}, {{.*}} : vector<16xf32>
+ // CHECK-NOT: arith.addf
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
: vector<256x64xf32> to vector<256xf32>
gpu.return
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index da6ad976d3730..ff0946d100a63 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -1,5 +1,13 @@
// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
+// CHECK-DAG: #map = affine_map<()[s0] -> (s0 floordiv 32)>
+// CHECK-DAG: #map1 = affine_map<()[s0] -> (s0 mod 32)>
+// CHECK-DAG: #map2 = affine_map<()[s0] -> (0)>
+// CHECK-DAG: #map3 = affine_map<()[s0] -> (s0 floordiv 4)>
+// CHECK-DAG: #map4 = affine_map<()[s0] -> (s0 mod 4)>
+// CHECK-DAG: #map5 = affine_map<()[s0] -> ((s0 mod 32) floordiv 16)>
+// CHECK-DAG: #map6 = affine_map<()[s0] -> (s0 mod 16)>
+// CHECK-DAG: #map7 = affine_map<()[s0] -> ((s0 mod 16) floordiv 4)>
gpu.module @test_distribution {
// CHECK-LABEL: create_nd_tdesc_no_offset
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
@@ -643,6 +651,181 @@ gpu.module @test_distribution {
gpu.return
}
+ // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_1
+ // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>)
+ gpu.func @vector_reduce_cross_sg_dim_1(%src: memref<?xf32>) {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1x32xf32>
+ // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32xindex>
+ // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<1x1x32xi1>
+ // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref<?xf32>, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+ // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
+ // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %[[CST_2]] [1] : vector<1x1x32xf32> to vector<1x32xf32>
+ // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x32xf32> to vector<1x32xf32>
+ // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3>
+ // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[AFFINE1:.*]] = affine.apply #map()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE2:.*]] = affine.apply #map1()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE3:.*]] = affine.apply #map2()[%[[SGID]]]
+ // CHECK-DAG: %[[MUL1:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ROW_OFFSET:.*]] = arith.addi %[[C0:.*]], %[[MUL1]] : index
+ // CHECK-DAG: %[[MUL2:.*]] = arith.muli %[[AFFINE1]], %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD1:.*]] = arith.addi %[[C0:.*]], %[[MUL2]] : index
+ // CHECK-DAG: %[[MUL3:.*]] = arith.muli %[[AFFINE3]], %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD2:.*]] = arith.addi %[[ADD1]], %[[MUL3]] : index
+ // CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD2]], %[[C32:.*]] : index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [1]>}>: vector<1x32xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ // CHECK-DAG: gpu.barrier
+ // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<32x32xf32>
+ // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
+ // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [0] : vector<32x32xf32> to vector<32xf32>
+ // CHECK-DAG: %[[SHAPE_CAST_FINAL:.*]] = vector.shape_cast %[[CST]] : vector<1x32xf32> to vector<32xf32>
+ // CHECK-DAG: %{{.*}} = arith.addf %[[FINAL_REDUCE]], %[[SHAPE_CAST_FINAL]] : vector<32xf32>
+ // CHECK-DAG: gpu.return
+ %cst_3 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} dense<1.0> : vector<1x32xf32>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<0> : vector<1x32x32xindex>
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} dense<true> : vector<1x32x32xi1>
+ %14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>} : memref<?xf32>, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32>
+ %15 = vector.multi_reduction <add>, %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32, 1], sg_data = [1, 1, 32]>, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32>
+ gpu.return
+ }
+
+ // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_0
+ // CHECK-SAME: (%[[ARG0:.*]]: memref<256x128xf32>)
+ gpu.func @vector_reduce_cross_sg_dim_0(%src: memref<256x128xf32>) {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[REM1:.*]] = arith.remui %[[SGID]], %[[C4:.*]] : index
+ // CHECK-DAG: %[[DIV1:.*]] = arith.divui %[[SGID]], %[[C4:.*]] : index
+ // CHECK-DAG: %[[REM2:.*]] = arith.remui %[[DIV1]], %[[C8:.*]] : index
+ // CHECK-DAG: %[[MUL1:.*]] = arith.muli %[[REM2]], %[[C32:.*]] : index
+ // CHECK-DAG: %[[MUL2:.*]] = arith.muli %[[REM1]], %[[C32:.*]] : index
+ // CHECK-DAG: %[[REM3:.*]] = arith.remui %[[MUL1]], %[[C256:.*]] : index
+ // CHECK-DAG: %[[REM4:.*]] = arith.remui %[[MUL2]], %[[C128:.*]] : index
+ // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[REM3]], %[[REM4]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32>
+ // CHECK-DAG: %[[LOAD_ND:.*]] = xegpu.load_nd %[[TDESC]] : !xegpu.tensor_desc<32x32xf32> -> vector<32x32xf32>
+ // CHECK-DAG: %[[CST_LOCAL:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
+ // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_ND]], %[[CST_LOCAL]] [0] : vector<32x32xf32> to vector<32xf32>
+ // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<32xf32> to vector<1x32xf32>
+ // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3>
+ // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<8x128xf32>
+ // CHECK-DAG: %[[SGID2:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[AFFINE1:.*]] = affine.apply #map3()[%[[SGID2]]]
+ // CHECK-DAG: %[[AFFINE2:.*]] = affine.apply #map4()[%[[SGID2]]]
+ // CHECK-DAG: %[[MUL3:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ROW_OFFSET:.*]] = arith.addi %[[C0:.*]], %[[MUL3]] : index
+ // CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD1:.*]] = arith.addi %[[C0:.*]], %[[MUL4]] : index
+ // CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD1]], %[[C32:.*]] : index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [0]>}>: vector<1x32xf32>, !xegpu.mem_desc<8x128xf32>, index, index
+ // CHECK-DAG: gpu.barrier
+ // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<8x128xf32>, index, index -> vector<8x32xf32>
+ // CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32>
+ // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32>
+ // CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST:.*]] : vector<32xf32>
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} dense<0.0> : vector<128xf32>
+ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ %load = xegpu.load_nd %tdesc
+ : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>>
+ -> vector<256x128xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32]>, dims = [0]>} [0]
+ : vector<256x128xf32> to vector<128xf32>
+ gpu.return
+ }
+
+ // CHECK-LABEL: gpu.func @vector_reduce_multi_dim
+ // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>)
+ gpu.func @vector_reduce_multi_dim(%src: memref<?xf32>) {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32>
+ // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32x32xindex>
+ // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<1x1x32x32xi1>
+ // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref<?xf32>, vector<1x1x32x32xindex>, vector<1x1x32x32xi1> -> vector<1x1x32x32xf32>
+ // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32>
+ // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %[[CST_2]] [2, 3] : vector<1x1x32x32xf32> to vector<1x1xf32>
+ // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x1xf32> to vector<1x1xf32>
+ // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<256xi8, 3>
+ // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<256xi8, 3> -> !xegpu.mem_desc<16x4xf32>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+ // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+ // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
+ // CHECK-DAG: %[[AFFINE1:.*]] = affine.apply #map()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE2:.*]] = affine.apply #map1()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE3:.*]] = affine.apply #map5()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE4:.*]] = affine.apply #map6()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE5:.*]] = affine.apply #map7()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE6:.*]] = affine.apply #map4()[%[[SGID]]]
+ // CHECK-DAG: %[[MUL1:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD1:.*]] = arith.addi %[[C0:.*]], %[[MUL1]] : index
+ // CHECK-DAG: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4:.*]] : index
+ // CHECK-DAG: %[[ROW_OFFSET:.*]] = arith.addi %[[ADD1]], %[[MUL2]] : index
+ // CHECK-DAG: %[[MUL3:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD2:.*]] = arith.addi %[[C0:.*]], %[[MUL3]] : index
+ // CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C2:.*]] : index
+ // CHECK-DAG: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[MUL4]] : index
+ // CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD3]], %[[C1:.*]] : index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [2, 3]>}>: vector<1x1xf32>, !xegpu.mem_desc<16x4xf32>, index, index
+ // CHECK-DAG: gpu.barrier
+ // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<16x4xf32>, index, index -> vector<16x1xf32>
+ // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+ // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [0] : vector<16x1xf32> to vector<1xf32>
+ // CHECK-DAG: %[[SHAPE_CAST_FINAL:.*]] = vector.shape_cast %[[CST]] : vector<1x1xf32> to vector<1xf32>
+ // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[SHAPE_CAST_FINAL]] : vector<1xf32>
+ // CHECK-DAG: gpu.return
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<2x2xf32>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<0> : vector<2x2x128x128xindex>
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} dense<true> : vector<2x2x128x128xi1>
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>} : memref<?xf32>, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [1, 1, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32>
+ gpu.return
+ }
+
+ // CHECK-LABEL: gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction
+ // CHECK-SAME: (%[[ARG0:.*]]: memref<?xf32>)
+ gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction(%src: memref<?xf32>) {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32>
+ // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<16x16x32x32xindex>
+ // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<16x16x32x32xi1>
+ // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref<?xf32>, vector<16x16x32x32xindex>, vector<16x16x32x32xi1> -> vector<16x16x32x32xf32>
+ // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32>
+ // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %[[CST_2]] [2, 3] : vector<16x16x32x32xf32> to vector<16x16xf32>
+ // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<16x16xf32> to vector<1x256xf32>
+ // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<65536xi8, 3>
+ // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<65536xi8, 3> -> !xegpu.mem_desc<16x1024xf32>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[AFFINE1:.*]] = affine.apply #map()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE2:.*]] = affine.apply #map1()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE3:.*]] = affine.apply #map5()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE4:.*]] = affine.apply #map6()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE5:.*]] = affine.apply #map7()[%[[SGID]]]
+ // CHECK-DAG: %[[AFFINE6:.*]] = affine.apply #map4()[%[[SGID]]]
+ // CHECK-DAG: %[[MUL1:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD1:.*]] = arith.addi %[[C0:.*]], %[[MUL1]] : index
+ // CHECK-DAG: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4:.*]] : index
+ // CHECK-DAG: %[[ROW_OFFSET:.*]] = arith.addi %[[ADD1]], %[[MUL2]] : index
+ // CHECK-DAG: %[[MUL3:.*]] = arith.muli {{.*}}, %[[C1:.*]] : index
+ // CHECK-DAG: %[[ADD2:.*]] = arith.addi %[[C0:.*]], %[[MUL3]] : index
+ // CHECK-DAG: %[[MUL4:.*]] = arith.muli {{.*}}, %[[C2:.*]] : index
+ // CHECK-DAG: %[[ADD3:.*]] = arith.addi %[[ADD2]], %[[MUL4]] : index
+ // CHECK-DAG: %[[COL_OFFSET:.*]] = arith.muli %[[ADD3]], %[[C256:.*]] : index
+ // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]][%[[ROW_OFFSET]], %[[COL_OFFSET]]] <{layout = #xegpu.slice<#xegpu.layout<>, dims = [2, 3]>}>: vector<1x256xf32>, !xegpu.mem_desc<16x1024xf32>, index, index
+ // CHECK-DAG: gpu.barrier
+ // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]][%[[C0:.*]], %[[COL_OFFSET]]] : !xegpu.mem_desc<16x1024xf32>, index, index -> vector<16x256xf32>
+ // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<256xf32>
+ // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD_SLM]], %[[CST_3]] [0] : vector<16x256xf32> to vector<256xf32>
+ // CHECK-DAG: %[[SHAPE_CAST_FINAL:.*]] = vector.shape_cast %[[CST]] : vector<16x16xf32> to vector<256xf32>
+ // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[SHAPE_CAST_FINAL]] : vector<256xf32>
+ // CHECK-DAG: gpu.return
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} dense<0.0> : vector<32x32xf32>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<0> : vector<32x32x128x128xindex>
+ %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} dense<true> : vector<32x32x128x128xi1>
+ %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>} : memref<?xf32>, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [2, 2, 4, 4], sg_data = [16, 16, 32, 32]>, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32>
+ gpu.return
+ }
+
// CHECK-LABEL: load_nd_tdesc_with_anchor_layout
gpu.func @load_nd_tdesc_with_anchor_layout(%src: memref<256x128xf32>) {
//CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
More information about the Mlir-commits
mailing list