[Mlir-commits] [mlir] [mlir][XeGPU] add unroll patterns for load_matrix and store_matrix (PR #154637)
Chao Chen
llvmlistbot at llvm.org
Wed Sep 3 08:10:01 PDT 2025
https://github.com/chencha3 updated https://github.com/llvm/llvm-project/pull/154637
From 22d4193fdec06afe6f3a3518480fe6cd32e4f0dc Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 20 Aug 2025 22:09:11 +0000
Subject: [PATCH 1/6] add unroll pattern and unit test for load_matrix and
store_matrix
---
.../mlir/Dialect/XeGPU/Transforms/Passes.td | 4 +-
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 16 ++++
.../XeGPU/Transforms/XeGPUBlocking.cpp | 12 +--
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 87 +++++++++++++++++--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 46 ++++++++++
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 23 +++++
6 files changed, 176 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3a88dae041dd1..ddf6b4ac85a90 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
to a hardware instruction.
}];
let dependentDialects = [
- "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
- ];
+ "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect",
+ "index::IndexDialect"];
}
#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index db8608c6d20b8..a40dc74edb200 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
namespace mlir {
class VectorType;
@@ -18,6 +19,7 @@ class OpResult;
class OpBuilder;
class ValueRange;
class TypeConverter;
+class OpFoldResult;
namespace xegpu {
class LayoutAttr;
@@ -128,6 +130,20 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
/// if no GPU module parent or XeVM target attribute exists.
std::optional<std::string> getChipStr(Operation *op);
+/// Generates element-wise addition ops of two arrays with automatic alignment.
+/// When the input arrays have different sizes, the shorter array is
+/// right-aligned with the longer array, and the unmatched leading elements from
+/// the longer array are preserved unchanged. This is commonly used for offset
+/// computation where higher-dimensional offsets need to be added to
+/// lower-dimensional adjustments.
+///
+/// Example:
+/// lhs = [l1, l2, l3], rhs = [r1, r2]
+/// Result: [l1, l2+r1, l3+r2]
+SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> lhs,
+ ArrayRef<OpFoldResult> rhs);
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index d82c541f31359..b11f5fe87559b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -8,6 +8,7 @@
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp>(op))
+ xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
- xegpu::LoadGatherOp>(op))
+ xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
return getTileShape(op->getOpOperand(1));
@@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- bool hasWgLayoutOperands =
+ bool hasSgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr);
return layout && layout.isWgLayout();
});
- bool hasWgLayoutResults =
+ bool hasSgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
return layout && layout.isWgLayout();
});
- if (hasWgLayoutOperands || hasWgLayoutResults) {
+
+ if (hasSgLayoutOperands || hasSgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index c793b71639e86..219e4e6f44618 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
}
};
+struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
+ using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
+ LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType valueTy = op.getType();
+ Type elemTy = valueTy.getElementType();
+ ArrayRef<int64_t> shape = valueTy.getShape();
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+
+ VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+
+ SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+ SmallVector<SmallVector<OpFoldResult>> offsetsList;
+ for (SmallVector<int64_t> offsets :
+ StaticTileOffsetRange(shape, *targetShape)) {
+ auto adds = xegpu::addWithRightAligned(
+ rewriter, loc, mixedOffsets,
+ getAsIndexOpFoldResult(op.getContext(), offsets));
+ offsetsList.push_back(adds);
+ }
+
+ SmallVector<Value> newOps;
+ for (SmallVector<OpFoldResult> offsets : offsetsList) {
+ auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
+ op.getLoc(), newValueTy, op.getMemDesc(), offsets,
+ layout.dropInstData());
+ newOps.push_back(newOp);
+ }
+ Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+ rewriter.replaceOp(op, castOp);
+ return success();
+ }
+};
+
+struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
+ using UnrollPattern<xegpu::StoreMatrixOp>::UnrollPattern;
+ LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType valueTy = op.getData().getType();
+ ArrayRef<int64_t> shape = valueTy.getShape();
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+
+ SmallVector<Type> convertedValTypes =
+ getUnrolledTypes(valueTy, *targetShape);
+ SmallVector<Value> convertedValues =
+ pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter);
+
+ SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+ SmallVector<SmallVector<OpFoldResult>> offsetsList;
+ for (SmallVector<int64_t> offsets :
+ StaticTileOffsetRange(shape, *targetShape)) {
+ auto adds = xegpu::addWithRightAligned(
+ rewriter, loc, mixedOffsets,
+ getAsIndexOpFoldResult(op.getContext(), offsets));
+ offsetsList.push_back(adds);
+ }
+
+ for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList))
+ rewriter.create<xegpu::StoreMatrixOp>(loc, v, op.getMemDesc(), offsets,
+ layout.dropInstData());
+
+ rewriter.eraseOp(op);
+ return success();
+ }
+};
+
} // namespace
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
- patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
- UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
- UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
- options);
+ patterns
+ .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
+ UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
+ UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp>(
+ patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 19eedbac0f76b..088e8a8c497d9 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -133,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
+ // for LoadMatrixOp, the layout is attached to the property of the op
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ // for StoreMatrixOp, the layout is attached to the property of the op
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -152,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
+
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -179,6 +195,8 @@ xegpu::setLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand,
void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
+ if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
+ return;
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setLayoutAttr(opr, layout);
@@ -424,3 +442,31 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {
return std::nullopt;
}
+
+/// Generates element-wise addition ops of two arrays with automatic alignment.
+/// When the input arrays have different sizes, the shorter array is
+/// right-aligned with the longer array, and the unmatched leading elements from
+/// the longer array are preserved unchanged. This is commonly used for offset
+/// computation where higher-dimensional offsets need to be added to
+/// lower-dimensional adjustments.
+///
+/// Example:
+/// lhs = [l1, l2, l3], rhs = [r1, r2]
+/// Result: [l1, l2+r1, l3+r2]
+SmallVector<OpFoldResult>
+xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> lhs,
+ ArrayRef<OpFoldResult> rhs) {
+ // ensure a is not shorter than b
+ ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
+ ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
+ SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
+ a = a.slice(a.size() - b.size());
+ for (auto [l, r] : llvm::zip(a, b)) {
+ auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
+ auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
+ results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
+ }
+ return results;
+ return {};
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d986e5bd1cfb4..9d63c2ddd4895 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -561,3 +561,26 @@ gpu.module @test_kernel {
gpu.return %e : vector<8x32x2xf16>
}
}
+
+// -----
+gpu.module @test_kernel {
+ //CHECK-LABEL: unroll_load_matrix
+ gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
+ //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
+ //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ gpu.return %1: vector<32x32xf32>
+ }
+}
+
+// -----
+gpu.module @test_kernel {
+ // CHECK-LABEL: unroll_store_matrix
+ gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
+ %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
+ // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+ xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ gpu.return
+ }
+}
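
For readers following the first patch: UnrollLoadMatrixOp walks StaticTileOffsetRange(shape, *targetShape) and emits one small load_matrix per tile, adding each static tile offset onto the op's original mixed offsets. Below is a self-contained C++ sketch of that enumeration for the test case above (a 32x32 load with inst_data = [8, 16]); it mirrors the iteration order for illustration only and is not the MLIR implementation.

#include <cstdint>
#include <cstdio>

// Enumerate the per-tile offsets visited when unrolling the 32x32 load
// into 8x16 tiles; base holds the op's original [0, 0] offsets.
int main() {
  const int64_t shape[2] = {32, 32};
  const int64_t tile[2] = {8, 16};
  const int64_t base[2] = {0, 0};
  for (int64_t i = 0; i < shape[0]; i += tile[0])
    for (int64_t j = 0; j < shape[1]; j += tile[1])
      std::printf("xegpu.load_matrix ... [%lld, %lld] -> vector<8x16xf32>\n",
                  static_cast<long long>(base[0] + i),
                  static_cast<long long>(base[1] + j));
  return 0;
}

Four tiles along dim 0 times two along dim 1 gives the eight loads that the CHECK-COUNT-8 lines expect.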
From d20da858449bfd926df69de3f2b777cae4ee2f24 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 20 Aug 2025 22:09:11 +0000
Subject: [PATCH 2/6] add unroll pattern and unit test for load_matrix and
store_matrix
---
.../mlir/Dialect/XeGPU/Transforms/Passes.td | 4 +-
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 1 +
.../XeGPU/Transforms/XeGPUBlocking.cpp | 12 +--
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 87 +++++++++++++++++--
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 17 ++++
mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 23 +++++
6 files changed, 132 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3a88dae041dd1..ddf6b4ac85a90 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
to a hardware instruction.
}];
let dependentDialects = [
- "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
- ];
+ "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect",
+ "index::IndexDialect"];
}
#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index b2b2d3ab85231..a40dc74edb200 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -19,6 +19,7 @@ class OpResult;
class OpBuilder;
class ValueRange;
class TypeConverter;
+class OpFoldResult;
namespace xegpu {
class LayoutAttr;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index b3144e4c1e55d..fb4f00b21f2b9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -8,6 +8,7 @@
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
@@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
- xegpu::UpdateOffsetOp>(op))
+ xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
- xegpu::LoadGatherOp>(op))
+ xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op))
return getTileShape(op->getOpOperand(0));
if (isa<xegpu::StoreNdOp, xegpu::StoreScatterOp>(op))
return getTileShape(op->getOpOperand(1));
@@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- bool hasWgLayoutOperands =
+ bool hasSgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr);
return layout && layout.isForWorkgroup();
});
- bool hasWgLayoutResults =
+ bool hasSgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
return layout && layout.isForWorkgroup();
});
- if (hasWgLayoutOperands || hasWgLayoutResults) {
+
+ if (hasSgLayoutOperands || hasSgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index c793b71639e86..219e4e6f44618 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
}
};
+struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
+ using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
+ LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType valueTy = op.getType();
+ Type elemTy = valueTy.getElementType();
+ ArrayRef<int64_t> shape = valueTy.getShape();
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+
+ VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+
+ SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+ SmallVector<SmallVector<OpFoldResult>> offsetsList;
+ for (SmallVector<int64_t> offsets :
+ StaticTileOffsetRange(shape, *targetShape)) {
+ auto adds = xegpu::addWithRightAligned(
+ rewriter, loc, mixedOffsets,
+ getAsIndexOpFoldResult(op.getContext(), offsets));
+ offsetsList.push_back(adds);
+ }
+
+ SmallVector<Value> newOps;
+ for (SmallVector<OpFoldResult> offsets : offsetsList) {
+ auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
+ op.getLoc(), newValueTy, op.getMemDesc(), offsets,
+ layout.dropInstData());
+ newOps.push_back(newOp);
+ }
+ Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+ rewriter.replaceOp(op, castOp);
+ return success();
+ }
+};
+
+struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
+ using UnrollPattern<xegpu::StoreMatrixOp>::UnrollPattern;
+ LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op,
+ PatternRewriter &rewriter) const override {
+ std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+ if (!targetShape)
+ return failure();
+
+ Location loc = op.getLoc();
+ VectorType valueTy = op.getData().getType();
+ ArrayRef<int64_t> shape = valueTy.getShape();
+ auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
+
+ SmallVector<Type> convertedValTypes =
+ getUnrolledTypes(valueTy, *targetShape);
+ SmallVector<Value> convertedValues =
+ pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter);
+
+ SmallVector<OpFoldResult> mixedOffsets = op.getMixedOffsets();
+ SmallVector<SmallVector<OpFoldResult>> offsetsList;
+ for (SmallVector<int64_t> offsets :
+ StaticTileOffsetRange(shape, *targetShape)) {
+ auto adds = xegpu::addWithRightAligned(
+ rewriter, loc, mixedOffsets,
+ getAsIndexOpFoldResult(op.getContext(), offsets));
+ offsetsList.push_back(adds);
+ }
+
+ for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList))
+ rewriter.create<xegpu::StoreMatrixOp>(loc, v, op.getMemDesc(), offsets,
+ layout.dropInstData());
+
+ rewriter.eraseOp(op);
+ return success();
+ }
+};
+
} // namespace
void mlir::xegpu::populateXeGPUUnrollPatterns(
RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
- patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
- UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
- UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
- UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
- options);
+ patterns
+ .add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
+ UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, UnrollCreateDescOp,
+ UnrollLoadGatherOp, UnrollStoreScatterOp, UnrollPrefetchOp,
+ UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp>(
+ patterns.getContext(), options);
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 6835f64ad8ef7..f77749fd77831 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -134,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
+ // for LoadMatrixOp, the layout is attached to the property of the op
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ // for StoreMatrixOp, the layout is attached to the property of the op
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
return defOp->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -153,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
+
+ if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+
+ if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
+ return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
return op->getAttrOfType<xegpu::LayoutAttr>(layoutName);
@@ -180,6 +195,8 @@ xegpu::setLayoutAttr<mlir::OpOperand>(const mlir::OpOperand &operand,
void xegpu::setLayoutAttrs(Operation *op,
function_ref<LayoutAttr(Value)> getLayoutImpl) {
op->walk([&](Operation *nestOp) {
+ if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
+ return;
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setLayoutAttr(opr, layout);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index d986e5bd1cfb4..9d63c2ddd4895 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -561,3 +561,26 @@ gpu.module @test_kernel {
gpu.return %e : vector<8x32x2xf16>
}
}
+
+// -----
+gpu.module @test_kernel {
+ //CHECK-LABEL: unroll_load_matrix
+ gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> {
+ %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32>
+ //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32>
+ //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
+ %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<inst_data = [8, 16]>}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32>
+ gpu.return %1: vector<32x32xf32>
+ }
+}
+
+// -----
+gpu.module @test_kernel {
+ // CHECK-LABEL: unroll_store_matrix
+ gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) {
+ %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
+ // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index
+ xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout<inst_data = [8, 16]>} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>
+ gpu.return
+ }
+}
From 442c18aed86ab2423e20401d8c17f9c5b73543b3 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 27 Aug 2025 17:54:16 +0000
Subject: [PATCH 3/6] merge
---
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 95aa1fc58f4f6..2e17e559fdd2d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -136,11 +136,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
// for LoadMatrixOp, the layout is attached to the property of the op
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(defOp))
- return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+ return loadOp.getLayoutAttr();
// for StoreMatrixOp, the layout is attached to the property of the op
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(defOp))
- return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+ return storeOp.getLayoutAttr();
std::string layoutName = getLayoutName(result);
if (defOp->hasAttr(layoutName))
@@ -164,10 +164,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
Operation *op = opr.getOwner();
if (auto loadOp = dyn_cast<xegpu::LoadMatrixOp>(op))
- return dyn_cast_if_present<xegpu::LayoutAttr>(loadOp.getLayoutAttr());
+ return loadOp.getLayoutAttr();
if (auto storeOp = dyn_cast<xegpu::StoreMatrixOp>(op))
- return dyn_cast_if_present<xegpu::LayoutAttr>(storeOp.getLayoutAttr());
+ return storeOp.getLayoutAttr();
std::string layoutName = xegpu::getLayoutName(opr);
if (op->hasAttr(layoutName))
@@ -199,6 +199,7 @@ void xegpu::setDistributeLayoutAttrs(
op->walk([&](Operation *nestOp) {
if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(nestOp))
return;
+
for (OpOperand &opr : nestOp->getOpOperands()) {
auto layout = getLayoutImpl(opr.get());
setDistributeLayoutAttr(opr, layout);
@@ -471,5 +472,4 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
}
return results;
- return {};
}
From a368430b6636f3285062c45b7657d9d9103485f7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 27 Aug 2025 17:59:56 +0000
Subject: [PATCH 4/6] roll back unnecessary change
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 1ccb4e89fd6a5..5d5ff69e06886 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -205,20 +205,19 @@ XeGPUBlockingPass::getTileShape(Operation *op) const {
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
// skip the op if any of its operands or results has workgroup level layouts
- bool hasSgLayoutOperands =
+ bool hasWgLayoutOperands =
llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(opr);
return layout && layout.isForWorkgroup();
});
- bool hasSgLayoutResults =
+ bool hasWgLayoutResults =
llvm::any_of(op->getOpResults(), [](OpResult result) {
xegpu::DistributeLayoutAttr layout =
xegpu::getDistributeLayoutAttr(result);
return layout && layout.isForWorkgroup();
});
-
- if (hasSgLayoutOperands || hasSgLayoutResults) {
+ if (hasWgLayoutOperands || hasWgLayoutResults) {
LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
return false;
}
From 3f5d69299a65cc1854e4fe502dd2628629a31599 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 27 Aug 2025 18:26:19 +0000
Subject: [PATCH 5/6] add unit test
---
mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 27 +++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index f4a49da71605f..c0fb373835e3d 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -26,6 +26,33 @@ gpu.module @test_1_1_assignment {
gpu.return
}
+ // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref
+ // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32>
+ gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) {
+ //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index
+ //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]]
+ //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]]
+ //CHECK: [[C32:%.+]] = arith.constant 32 : index
+ //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]]
+ //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
+ //CHECK: [[C0:%.+]] = arith.constant 0 : index
+ //CHECK: [[C0_2:%.+]] = arith.constant 0 : index
+ //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
+ //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
+ //CHECK: [[C256:%.+]] = arith.constant 256 : index
+ //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
+ //CHECK: [[C128:%.+]] = arith.constant 128 : index
+ //CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
+ //CHECK: [[C0_3:%.+]] = arith.constant 0 : index
+ //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
+ //CHECK: [[C0_4:%.+]] = arith.constant 0 : index
+ //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
+ //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
+ -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+
// CHECK-LABEL: load_nd_tdesc
// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32>
gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) {
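
A note on the CHECK sequence in this new test: with sg_layout = [8, 4] and sg_data = [32, 32] over the 256x128 view, the linear subgroup id is split row-major into a (y, x) pair (the #map/#map1 affine maps presumably compute sgid floordiv 4 and sgid mod 4), scaled by sg_data via index.mul, and wrapped by the tensor extents via index.remu. A plain C++ sketch of that arithmetic, under those assumptions:

#include <cstdio>

// Subgroup-id to per-subgroup offset arithmetic encoded by the CHECK
// lines, assuming #map = sgid floordiv 4 and #map1 = sgid mod 4.
int main() {
  const int sgLayout[2] = {8, 4};   // sg_layout
  const int sgData[2] = {32, 32};   // sg_data
  const int extent[2] = {256, 128}; // shape of the 2-d view
  for (int sgid = 0; sgid < sgLayout[0] * sgLayout[1]; ++sgid) {
    int y = (sgid / sgLayout[1]) * sgData[0] % extent[0]; // index.mul + remu
    int x = (sgid % sgLayout[1]) * sgData[1] % extent[1]; // index.mul + remu
    std::printf("sgid %2d -> tdesc offsets [1, %3d, %3d]\n", sgid, y, x);
  }
  return 0;
}

The leading constant 1 in the printed offsets corresponds to the untouched outermost offset in create_nd_tdesc %src[1, 0, 0].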
From a0512d9f95f01b79b13192a37899972ad3314dfb Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Wed, 3 Sep 2025 15:09:44 +0000
Subject: [PATCH 6/6] address comments
---
.../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +++++-
.../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 14 ++++++-------
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 21 ++++++++++++++-----
3 files changed, 28 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 701d851eade35..04cfd58d846a7 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -144,6 +144,11 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op,
/// if no GPU module parent or XeVM target attribute exists.
std::optional<std::string> getChipStr(Operation *op);
+/// Generates element-wise addition ops of two arrays of the same length.
+SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc,
+ ArrayRef<OpFoldResult> lhs,
+ ArrayRef<OpFoldResult> rhs);
+
/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
@@ -157,7 +162,6 @@ std::optional<std::string> getChipStr(Operation *op);
SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> lhs,
ArrayRef<OpFoldResult> rhs);
-
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 219e4e6f44618..d24d82780ebaa 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -686,12 +686,12 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
using UnrollPattern<xegpu::LoadMatrixOp>::UnrollPattern;
LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op,
PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ VectorType valueTy = op.getType();
std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
+ if (!targetShape || targetShape->size() != (size_t)valueTy.getRank())
return failure();
- Location loc = op.getLoc();
- VectorType valueTy = op.getType();
Type elemTy = valueTy.getElementType();
ArrayRef<int64_t> shape = valueTy.getShape();
auto layout = dyn_cast<xegpu::LayoutAttr>(op.getLayoutAttr());
@@ -702,17 +702,17 @@ struct UnrollLoadMatrixOp : public UnrollPattern<xegpu::LoadMatrixOp> {
SmallVector<SmallVector<OpFoldResult>> offsetsList;
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(shape, *targetShape)) {
- auto adds = xegpu::addWithRightAligned(
+ auto adds = xegpu::addElementwise(
rewriter, loc, mixedOffsets,
getAsIndexOpFoldResult(op.getContext(), offsets));
offsetsList.push_back(adds);
}
SmallVector<Value> newOps;
+ layout = layout.dropInstData();
for (SmallVector<OpFoldResult> offsets : offsetsList) {
auto newOp = rewriter.create<xegpu::LoadMatrixOp>(
- op.getLoc(), newValueTy, op.getMemDesc(), offsets,
- layout.dropInstData());
+ op.getLoc(), newValueTy, op.getMemDesc(), offsets, layout);
newOps.push_back(newOp);
}
Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
@@ -743,7 +743,7 @@ struct UnrollStoreMatrixOp : public UnrollPattern<xegpu::StoreMatrixOp> {
SmallVector<SmallVector<OpFoldResult>> offsetsList;
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(shape, *targetShape)) {
- auto adds = xegpu::addWithRightAligned(
+ auto adds = xegpu::addElementwise(
rewriter, loc, mixedOffsets,
getAsIndexOpFoldResult(op.getContext(), offsets));
offsetsList.push_back(adds);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 2e17e559fdd2d..b72d5648b29f9 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -447,6 +447,21 @@ std::optional<std::string> xegpu::getChipStr(Operation *op) {
return std::nullopt;
}
+/// Generates element-wise addition ops of two arrays of the same length.
+SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
+ Location loc,
+ ArrayRef<OpFoldResult> lhs,
+ ArrayRef<OpFoldResult> rhs) {
+ assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
+ SmallVector<OpFoldResult> results;
+ for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
+ auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
+ auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
+ results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
+ }
+ return results;
+}
+
/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
@@ -466,10 +481,6 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
a = a.slice(a.size() - b.size());
- for (auto [l, r] : llvm::zip(a, b)) {
- auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
- auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
- results.push_back(builder.createOrFold<index::AddOp>(loc, lval, rval));
- }
+ results.append(addElementwise(builder, loc, a, b));
return results;
}
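
After this final patch, addWithRightAligned keeps the unmatched leading elements of the longer array and delegates the aligned tail to addElementwise. A self-contained C++ sketch of the combined semantics on plain integers (the real utilities take OpFoldResult and build index.add ops through the rewriter):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Same-length element-wise addition, mirroring xegpu::addElementwise.
std::vector<int64_t> addElementwise(const std::vector<int64_t> &lhs,
                                    const std::vector<int64_t> &rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  std::vector<int64_t> results;
  for (size_t i = 0; i < lhs.size(); ++i)
    results.push_back(lhs[i] + rhs[i]);
  return results;
}

// Right-aligned addition, mirroring xegpu::addWithRightAligned: unmatched
// leading elements of the longer array pass through unchanged.
std::vector<int64_t> addWithRightAligned(std::vector<int64_t> a,
                                         std::vector<int64_t> b) {
  if (a.size() < b.size())
    std::swap(a, b); // ensure a is not shorter than b
  size_t prefix = a.size() - b.size();
  std::vector<int64_t> results(a.begin(), a.begin() + prefix);
  std::vector<int64_t> tail(a.begin() + prefix, a.end());
  std::vector<int64_t> sums = addElementwise(tail, b);
  results.insert(results.end(), sums.begin(), sums.end());
  return results;
}

int main() {
  // lhs = [l1, l2, l3], rhs = [r1, r2] -> [l1, l2+r1, l3+r2]
  assert((addWithRightAligned({1, 2, 3}, {10, 20}) ==
          std::vector<int64_t>{1, 12, 23}));
  return 0;
}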