[Mlir-commits] [mlir] [MLIR][XeGPU] Preserve leading unit dimension during blocking (PR #180884)
Jianhui Li
llvmlistbot at llvm.org
Thu Feb 12 09:34:20 PST 2026
https://github.com/Jianhui-Li updated https://github.com/llvm/llvm-project/pull/180884
>From 17d9b0dd3a88f52f40d0041451a84c7c1509ed12 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 10 Feb 2026 23:03:23 +0000
Subject: [PATCH 1/7] allow leading dim for scatter load in blocking and sg
distribution
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 8 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 88 ++++++++++++++-----
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 7 ++
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 15 ++++
4 files changed, 96 insertions(+), 22 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 6faa25cf49df9..8b8647fbc8757 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -162,13 +162,19 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
ownerOp &&
(isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
- xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
+ xegpu::LoadGatherOp, xegpu::StoreScatterOp, xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
vector::TransposeOp, vector::ShapeCastOp,
vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
if (!skipLeadingUnitDimRemoval) {
auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
instData.erase(instData.begin(), it);
}
+
+ //print instData for debugging
+ llvm::errs() << "Inst data for value " << value << " in op " << *ownerOp << ": ";
+ for (auto dim : instData) llvm::errs() << dim << " ";
+ llvm::errs() << "\n";
+
return instData;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index aa1dfaa9e0fda..25462a93e5393 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1058,6 +1058,15 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
/// To
/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+///
+/// Note that the load distribution pattern also handles leading unit dimensions
+/// in the payload vector to support cases where the load is distributed by
+/// the warp op with unit dimensions added to the front of the vector. In this
+/// case the load distribution will only change the dimensions corresponding to
+/// the SG distribution and keep the leading unit dimensions unchanged. For
+/// example, a load with result type vector<1x16xf16> distributed by the warp op
+/// with layout expecting vector<1x16xf16> will be transformed to have result
+/// type vector<1x1xf16>.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -1082,19 +1091,23 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
"Load op must have a vector arguments for offsets and mask");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(loadGatherOp,
- "Expected 1D offsets and mask vector");
- // Assume offset and mask producers will be distributed as well.
- std::string layoutOffsetsName =
- xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(1));
- std::string layoutMaskName =
- xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(2));
+ VectorType resultVecTy =
+ cast<VectorType>(loadGatherOp.getResult().getType());
+
+ // add handling leading unit dimensions support
+ int chunkSize = loadGatherOp.getChunkSize().value_or(1);
+ int effectiveVecRank = chunkSize > 1 ? 1 : 2;
+ for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
+ if (resultVecTy.getShape()[i] != 1) {
+ return rewriter.notifyMatchFailure(
+ loadGatherOp, "Only unit dimensions allowed for the leading "
+ "dimensions of the load vector!");
+ }
+ }
- xegpu::LayoutAttr layoutOffsets =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
- xegpu::LayoutAttr layoutMask =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
+ auto layoutOffsets =
+ xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
+ auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
@@ -1109,26 +1122,59 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = loadGatherOp->getOperands();
- SmallVector<Type> operandTypesToYield = {
- operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
VectorType distResultTy =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
- // Distributed load op will always be 1D.
- VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
- distResultTy.getElementType());
+ VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
+ VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
+
+ SmallVector<Type> operandTypesToYield = {operands[0].getType(),
+ distOffsetsTy, distMaskTy};
+
+ // Debug print
+ llvm::errs() << "LoadDistribution: operands.size() = " << operands.size()
+ << "\n";
+ llvm::errs() << "LoadDistribution: operandTypesToYield.size() = "
+ << operandTypesToYield.size() << "\n";
+ for (size_t i = 0; i < operands.size(); ++i) {
+ llvm::errs() << " operand[" << i << "] type: " << operands[i].getType()
+ << "\n";
+ }
+ for (size_t i = 0; i < operandTypesToYield.size(); ++i) {
+ llvm::errs() << " operandTypesToYield[" << i
+ << "]: " << operandTypesToYield[i] << "\n";
+ }
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
+ // SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
+ // newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
+
+ // Distributed load op will always be 1D.
+ VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
+ distResultTy.getElementType());
+
+ VectorType distOffsetsTy1D =
+ VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
+ distOffsetsByWarpOpOrFailure.value().getElementType());
+ VectorType distMaskTy1D =
+ VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
+ distMaskByWarpOpOrFailure.value().getElementType());
+
+ Value distOffsetVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
+ Value distmaskVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
+
+ SmallVector<Value> newLoadGatherOperands = {
+ newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
+
xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
- rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
+ rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
loadGatherOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 2b1bd4d73a576..792fe12b88397 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -728,6 +728,13 @@ struct UnrollStoreScatterOpWithOffsets
return failure();
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ //print target shape for debugging
+ llvm::errs() << "Target shape: ";
+ if (targetShape) {
+ for (auto dim : *targetShape)
+ llvm::errs() << dim << " ";
+ }
+ llvm::errs() << "\n";
if (!targetShape)
return failure();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index c47fd92fe46d7..fb5dff8981c9d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -390,11 +390,26 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
SmallVector<int64_t> staticStrides(offsets.size(), 1);
+
+ // Debug print
+ llvm::errs() << "Extracting slice with offsets: [";
+ for (size_t i = 0; i < offsets.size(); ++i) {
+ llvm::errs() << offsets[i];
+ if (i + 1 < offsets.size()) llvm::errs() << ", ";
+ }
+ llvm::errs() << "], shape: [";
+ for (size_t i = 0; i < adjustedTargetShape.size(); ++i) {
+ llvm::errs() << adjustedTargetShape[i];
+ if (i + 1 < adjustedTargetShape.size()) llvm::errs() << ", ";
+ }
+ llvm::errs() << "]\n";
+
Value slice = vector::ExtractStridedSliceOp::create(
builder, loc, value, offsets, adjustedTargetShape, staticStrides);
// Reshape to remove leading unit dims if needed
if (srcShapeRank > targetShapeRank) {
+ llvm::errs() << "Reshaping from rank " << srcShapeRank << " to rank " << targetShapeRank << "\n";
auto targetTy = VectorType::get(shape, vecTy.getElementType());
slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
}
>From 21517cf3bab285af7bf50f940fe5e16453f75e54 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Tue, 10 Feb 2026 23:29:22 +0000
Subject: [PATCH 2/7] minor fix of chunk_size
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 17 +----------------
1 file changed, 1 insertion(+), 16 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 25462a93e5393..4ec01d4517a65 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1093,10 +1093,9 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
VectorType resultVecTy =
cast<VectorType>(loadGatherOp.getResult().getType());
-
// add handling leading unit dimensions support
int chunkSize = loadGatherOp.getChunkSize().value_or(1);
- int effectiveVecRank = chunkSize > 1 ? 1 : 2;
+ int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
if (resultVecTy.getShape()[i] != 1) {
return rewriter.notifyMatchFailure(
@@ -1132,20 +1131,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
SmallVector<Type> operandTypesToYield = {operands[0].getType(),
distOffsetsTy, distMaskTy};
- // Debug print
- llvm::errs() << "LoadDistribution: operands.size() = " << operands.size()
- << "\n";
- llvm::errs() << "LoadDistribution: operandTypesToYield.size() = "
- << operandTypesToYield.size() << "\n";
- for (size_t i = 0; i < operands.size(); ++i) {
- llvm::errs() << " operand[" << i << "] type: " << operands[i].getType()
- << "\n";
- }
- for (size_t i = 0; i < operandTypesToYield.size(); ++i) {
- llvm::errs() << " operandTypesToYield[" << i
- << "]: " << operandTypesToYield[i] << "\n";
- }
-
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
>From bbc3d41cd679079061de22ffe80a0d5d9edf870e Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 11 Feb 2026 04:42:23 +0000
Subject: [PATCH 3/7] preserve leading dimension during blocking
---
.../XeGPU/Transforms/XeGPUBlocking.cpp | 31 -----
.../Transforms/XeGPUSubgroupDistribute.cpp | 109 ++++++++++--------
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 68 +++++------
3 files changed, 95 insertions(+), 113 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 8b8647fbc8757..4eb6ad51ee9bf 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -152,32 +152,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
if (layout && layout.isForSubgroup()) {
if (!layout.getEffectiveInstDataAsInt().empty()) {
SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
- // Remove leading unit dimensions from inst_data for non-rank-sensitive
- // ops. For example, if the inst_data is [1, 1, 32] it will pass [32] as
- // the unroll/blocking size.
- // Skip it for rank-sensitive ops, whose semantics depend on the tensor
- // rank (and consequently its shape), and therefore must not alter the
- // input tile rank or shape, such as by dropping leading dimensions.
- bool skipLeadingUnitDimRemoval =
- ownerOp &&
- (isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
- xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
- xegpu::LoadGatherOp, xegpu::StoreScatterOp, xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
- vector::TransposeOp, vector::ShapeCastOp,
- vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
- if (!skipLeadingUnitDimRemoval) {
- auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
- instData.erase(instData.begin(), it);
- }
-
- //print instData for debugging
- llvm::errs() << "Inst data for value " << value << " in op " << *ownerOp << ": ";
- for (auto dim : instData) llvm::errs() << dim << " ";
- llvm::errs() << "\n";
-
return instData;
}
-
if (auto type = dyn_cast<ShapedType>(value.getType()))
return llvm::to_vector(type.getShape());
}
@@ -356,13 +332,6 @@ void XeGPUBlockingPass::runOnOperation() {
xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter);
- // Remove leading unit dimensions from vector ops and then
- // do the unrolling.
- {
- RewritePatternSet patterns(ctx);
- vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
- (void)applyPatternsGreedily(op, std::move(patterns));
- }
xegpu::UnrollOptions options;
options.setFilterConstraint(
[&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 4ec01d4517a65..ce5f4f887e910 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -778,6 +778,15 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
/// To
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+///
+/// Note that the store distribution pattern also handles leading unit
+/// dimensions in the payload, mask and offsets vectors. In this case the store
+/// distribution will only change the dimensions corresponding to the SG
+/// distribution and keep the leading unit dimensions unchanged.
+/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
+/// will be distributed as vector<1x1xf16>. Shape cast ops are inserted for the
+/// offset/mask/payload when necessary so that the distributed store operates
+/// on 1D vectors to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -792,30 +801,27 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp, "Store op must have a vector of offsets argument");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(storeScatterOp,
- "Expected 1D offsets and mask vector");
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
- if (storeVecTy.getRank() > 2)
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Expected at most 2D result at SG level");
-
- std::string layoutPayloadName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(0));
- std::string layoutOffsetsName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(2));
- std::string layoutMaskName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3));
-
- xegpu::DistributeLayoutAttr layoutPayload =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutPayloadName);
- xegpu::DistributeLayoutAttr layoutOffsets =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutOffsetsName);
- xegpu::DistributeLayoutAttr layoutMask =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutMaskName);
+
+ // Add handling for leading unit dimensions support
+ int chunkSize = storeScatterOp.getChunkSize().value_or(1);
+ int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+
+ // Check that all leading dimensions are unit dimensions
+ for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
+ if (storeVecTy.getShape()[i] != 1) {
+ return rewriter.notifyMatchFailure(
+ storeScatterOp, "Only unit dimensions allowed for the leading "
+ "dimensions of the store vector!");
+ }
+ }
+
+ auto layoutPayload =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
+ auto layoutOffsets =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
+ auto layoutMask =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -830,29 +836,42 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp,
"Some vector operands have no layouts, using defaults instead.");
}
- // Distributed store payload type according to the lane layout.
- VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
- // Expected distributed payload type is always 1D.
- VectorType expectedPayloadTy =
- VectorType::get({distPayloadTyByWarpOp.getNumElements()},
- distPayloadTyByWarpOp.getElementType());
+
+ VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
+ VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
+ VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
SmallVector<Type> operandTypesToYield = {
- distPayloadTyByWarpOp, operands[1].getType(),
- distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
+ distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
- // The payload operand may need type adjustment due to mismatch between warp
- // distributed type and expected SIMT type.
+
rewriter.setInsertionPointAfter(newWarpOp);
- newStoreScatterOpOperands[0] = resolveDistributedTy(
- newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
+
+ // Distributed store payload type is always 1D without leading unit dims
+ VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
+ distPayloadTy.getElementType());
+
+ VectorType distOffsetsTy1D = VectorType::get(
+ {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
+ VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
+ distMaskTy.getElementType());
+
+ // Resolve distributed types to 1D for SIMT execution
+ Value distPayloadVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
+ Value distOffsetVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
+ Value distMaskVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);
+
+ SmallVector<Value> newStoreScatterOpOperands = {
+ distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
+ distMaskVal};
+
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
@@ -1060,13 +1079,13 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
///
/// Note that the load distribution pattern also handles leading unit dimensions
-/// in the payload vector to support cases where the load is distributed by
-/// the warp op with unit dimensions added to the front of the vector. In this
-/// case the load distribution will only change the dimensions corresponding to
-/// the SG distribution and keep the leading unit dimensions unchanged. For
-/// example, a load with result type vector<1x16xf16> distributed by the warp op
-/// with layout expecting vector<1x16xf16> will be transformed to have result
-/// type vector<1x1xf16>.
+/// in the payload, mask, and offsets vectors. The load distribution will only
+/// change the dimensions corresponding to the SG distribution and keep the
+/// leading unit dimensions unchanged. For example, a load with result type
+/// vector<1x16xf16> with lane layout [1, 16] will be distributed
+/// as result type vector<1x1xf16>. Shape cast ops are inserted for the
+/// offset/mask/payload when necessary so that the distributed load operates
+/// on 1D vectors to match the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 68f6e8e1ec955..e80a9144b9674 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -740,17 +740,17 @@ gpu.module @test_kernel {
// -----
gpu.module @test_kernel {
- // CHECK-LABEL: remove_unit_dim_inst_data
+ // CHECK-LABEL: preserve_unit_dim_of_load_inst_data
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
- // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
- // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
- // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
- // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
- // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
- // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
- // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
- gpu.func @remove_unit_dim_inst_data(%src: ui64) -> vector<1x1x32xf32> {
+ // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<1x1x16xi1>
+ // CHECK: [[cst_1:%.+]] = arith.constant dense<{{.*}}> : vector<1x1x16xindex>
+ // CHECK: [[cst_2:%.+]] = arith.constant dense<{{.*}}> : vector<1x1x16xindex>
+ // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
+ // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
+ gpu.func @preserve_unit_dim_of_load_inst_data(%src: ui64) -> vector<1x1x32xf32> {
%cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
[0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
@@ -770,8 +770,6 @@ gpu.module @test_kernel {
gpu.module @test_kernel {
// CHECK-LABEL: load_store_nd_with_offsets
// CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32>
- // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32xf32>
- // CHECK-DAG: [[cst_0:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
// CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index
// CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
// CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
@@ -779,27 +777,12 @@ gpu.module @test_kernel {
// CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
// CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
// CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
- // CHECK: [[ins_a0:%.+]] = vector.insert_strided_slice [[ld_a0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- // CHECK: [[ins_a1:%.+]] = vector.insert_strided_slice [[ld_a1]], [[ins_a0]] {offsets = [0, 16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
// CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
// CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
- // CHECK: [[ins_b0:%.+]] = vector.insert_strided_slice [[ld_b0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- // CHECK: [[ins_b1:%.+]] = vector.insert_strided_slice [[ld_b1]], [[ins_b0]] {offsets = [0, 16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- // CHECK: [[ext_a:%.+]] = vector.extract [[ins_a1]][0] : vector<32xf32> from vector<1x32xf32>
- // CHECK: [[ext_b:%.+]] = vector.extract [[ins_b1]][0] : vector<32xf32> from vector<1x32xf32>
- // CHECK: [[slice_a0:%.+]] = vector.extract_strided_slice [[ext_a]] {offsets = [0], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32>
- // CHECK: [[slice_b0:%.+]] = vector.extract_strided_slice [[ext_b]] {offsets = [0], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32>
- // CHECK: [[add0:%.+]] = arith.addf [[slice_a0]], [[slice_b0]] : vector<16xf32>
- // CHECK: [[ins_add0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0], strides = [1]} : vector<16xf32> into vector<32xf32>
- // CHECK: [[slice_a1:%.+]] = vector.extract_strided_slice [[ext_a]] {offsets = [16], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32>
- // CHECK: [[slice_b1:%.+]] = vector.extract_strided_slice [[ext_b]] {offsets = [16], sizes = [16], strides = [1]} : vector<32xf32> to vector<16xf32>
- // CHECK: [[add1:%.+]] = arith.addf [[slice_a1]], [[slice_b1]] : vector<16xf32>
- // CHECK: [[ins_add1:%.+]] = vector.insert_strided_slice [[add1]], [[ins_add0]] {offsets = [16], strides = [1]} : vector<16xf32> into vector<32xf32>
- // CHECK: [[broadcast:%.+]] = vector.broadcast [[ins_add1]] : vector<32xf32> to vector<1x32xf32>
- // CHECK: [[ext_result0:%.+]] = vector.extract_strided_slice [[broadcast]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
- // CHECK: [[ext_result1:%.+]] = vector.extract_strided_slice [[broadcast]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
- // CHECK: xegpu.store_nd [[ext_result0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
- // CHECK: xegpu.store_nd [[ext_result1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+ // CHECK: [[add0:%.+]] = arith.addf [[ld_a0]], [[ld_b0]] : vector<1x16xf32>
+ // CHECK: [[add1:%.+]] = arith.addf [[ld_a1]], [[ld_b1]] : vector<1x16xf32>
+ // CHECK: xegpu.store_nd [[add0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+ // CHECK: xegpu.store_nd [[add1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
gpu.func @load_store_nd_with_offsets(%A: memref<1024x1024xf32>, %B: memref<1024x1024xf32>, %C: memref<1024x1024xf32>) {
%c0 = arith.constant 0 : index
@@ -817,16 +800,27 @@ gpu.module @test_kernel {
}
// -----
-#inst_data = #xegpu.layout<inst_data = [1, 1, 32]>
+#inst_data = #xegpu.layout<inst_data = [1, 1, 16]>
gpu.module @test_kernel {
// CHECK-LABEL: load_add_store_leading_unit_dims
// CHECK-SAME: [[arg0:%.+]]: ui64, [[arg1:%.+]]: ui64, [[arg2:%.+]]: ui64
- // CHECK: [[mask:%.+]] = arith.constant dense<true> : vector<32xi1>
- // CHECK: [[offsets:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<32xindex>
- // CHECK: [[a:%.+]] = xegpu.load [[arg0]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
- // CHECK: [[b:%.+]] = xegpu.load [[arg1]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
- // CHECK: [[add:%.+]] = arith.addf [[a]], [[b]] : vector<32xf32>
- // CHECK: xegpu.store [[add]], [[arg2]][[[offsets]]], [[mask]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
+ // CHECK: [[c0:%.+]] = arith.constant dense<true> : vector<1x1x16xi1>
+ // CHECK: [[c1:%.+]] = arith.constant dense<[{{\[\[}}0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]]]> : vector<1x1x16xindex>
+ // CHECK: [[c2:%.+]] = arith.constant dense<[{{\[\[}}128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]]]> : vector<1x1x16xindex>
+ // CHECK: [[v0:%.+]] = xegpu.load [[arg0]]{{\[}}[[c1]]], [[c0]]
+ // CHECK-SAME: ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[v1:%.+]] = xegpu.load [[arg0]]{{\[}}[[c2]]], [[c0]]
+ // CHECK-SAME: ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[v2:%.+]] = xegpu.load [[arg1]]{{\[}}[[c1]]], [[c0]]
+ // CHECK-SAME: ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[v3:%.+]] = xegpu.load [[arg1]]{{\[}}[[c2]]], [[c0]]
+ // CHECK-SAME: ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[v4:%.+]] = arith.addf [[v0]], [[v2]] : vector<1x1x16xf32>
+ // CHECK: [[v5:%.+]] = arith.addf [[v1]], [[v3]] : vector<1x1x16xf32>
+ // CHECK: xegpu.store [[v4]], [[arg2]]{{\[}}[[c1]]], [[c0]]
+ // CHECK-SAME: vector<1x1x16xf32>, ui64, vector<1x1x16xindex>, vector<1x1x16xi1>
+ // CHECK: xegpu.store [[v5]], [[arg2]]{{\[}}[[c2]]], [[c0]]
+ // CHECK-SAME: vector<1x1x16xf32>, ui64, vector<1x1x16xindex>, vector<1x1x16xi1>
gpu.func @load_add_store_leading_unit_dims(%A: ui64, %B: ui64, %C: ui64) {
%cst = arith.constant {layout_result_0 = #inst_data} dense<[
[[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
>From bd81a4bfc1da3ca412ee59ed0f6c6c1b8988c9ec Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 11 Feb 2026 04:45:10 +0000
Subject: [PATCH 4/7] clean up
---
.../lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 7 -------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 16 +---------------
2 files changed, 1 insertion(+), 22 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 792fe12b88397..2b1bd4d73a576 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -728,13 +728,6 @@ struct UnrollStoreScatterOpWithOffsets
return failure();
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- //print target shape for debugging
- llvm::errs() << "Target shape: ";
- if (targetShape) {
- for (auto dim : *targetShape)
- llvm::errs() << dim << " ";
- }
- llvm::errs() << "\n";
if (!targetShape)
return failure();
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index fb5dff8981c9d..fa5810ad7f828 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -390,26 +390,12 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
SmallVector<int64_t> staticStrides(offsets.size(), 1);
-
- // Debug print
- llvm::errs() << "Extracting slice with offsets: [";
- for (size_t i = 0; i < offsets.size(); ++i) {
- llvm::errs() << offsets[i];
- if (i + 1 < offsets.size()) llvm::errs() << ", ";
- }
- llvm::errs() << "], shape: [";
- for (size_t i = 0; i < adjustedTargetShape.size(); ++i) {
- llvm::errs() << adjustedTargetShape[i];
- if (i + 1 < adjustedTargetShape.size()) llvm::errs() << ", ";
- }
- llvm::errs() << "]\n";
-
+
Value slice = vector::ExtractStridedSliceOp::create(
builder, loc, value, offsets, adjustedTargetShape, staticStrides);
// Reshape to remove leading unit dims if needed
if (srcShapeRank > targetShapeRank) {
- llvm::errs() << "Reshaping from rank " << srcShapeRank << " to rank " << targetShapeRank << "\n";
auto targetTy = VectorType::get(shape, vecTy.getElementType());
slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
}
>From e09af33442603a33b5f5585de7eff87cd8afd814 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Wed, 11 Feb 2026 05:28:57 +0000
Subject: [PATCH 5/7] polish
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index fa5810ad7f828..c47fd92fe46d7 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -390,7 +390,6 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
SmallVector<int64_t> staticStrides(offsets.size(), 1);
-
Value slice = vector::ExtractStridedSliceOp::create(
builder, loc, value, offsets, adjustedTargetShape, staticStrides);
>From b6dedaf8f5f2abd5a8dfcf36c09d01a2993b3e27 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 12 Feb 2026 04:20:12 +0000
Subject: [PATCH 6/7] fix minor issue
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 4eb6ad51ee9bf..49b66d2a8f6f6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -138,13 +138,10 @@ template <typename T, typename>
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
Value value;
- Operation *ownerOp;
if constexpr (std::is_same_v<T, OpOperand>) {
value = operandOrResult.get();
- ownerOp = operandOrResult.getOwner();
} else {
value = (Value)operandOrResult;
- ownerOp = value.getDefiningOp();
}
xegpu::DistributeLayoutAttr layout =
>From 3e248b7557bee917e6550d116682e38ec295da29 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li at intel.com>
Date: Thu, 12 Feb 2026 17:34:02 +0000
Subject: [PATCH 7/7] remove comments
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ce5f4f887e910..7671e2bbc3322 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1153,9 +1153,6 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- // SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
- // newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
rewriter.setInsertionPointAfter(newWarpOp);
// Distributed load op will always be 1D.
More information about the Mlir-commits mailing list