[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)
Charitha Saumya
llvmlistbot at llvm.org
Wed Apr 16 13:59:38 PDT 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/135271
>From 39dcf9dbcd85ee7f9b413f6ae01128420d0f7ad0 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Mar 2025 20:32:24 +0000
Subject: [PATCH 01/53] save work
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 8 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 6 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 74 +++++++++++++++++--
3 files changed, 77 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..f09919f99c756 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -131,10 +131,10 @@ LogicalResult
SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
llvm::ArrayRef<uint32_t> wi_layout,
llvm::ArrayRef<uint32_t> wi_data) {
- if (wi_layout.size() != 2)
- return emitError() << "expected wi_layout of size 2";
- if (wi_data.size() != 2)
- return emitError() << "expected wi_data of size 2";
+ if (wi_layout.size() != 1 && wi_layout.size() != 2)
+ return emitError() << "expected 1D or 2D wi_layout";
+ if (wi_data.size() != 1 && wi_data.size() != 2)
+ return emitError() << "expected 1D or 2D wi_data";
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..89b96383699f6 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -95,12 +95,14 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
assert(succeeded(expectedValueShapeOrFailure) &&
"Failed to compute distributed vector shape for "
"tensor descriptor ");
+ bool isSIMD = valueShape == adjustedTdescShape;
+ bool isSIMT = valueShape == expectedValueShapeOrFailure.value().getShape();
- return valueTy == expectedValueShapeOrFailure.value()
+ return (isSIMD || isSIMT)
? success()
: emitError()
<< "Result shape " << makeString(valueShape)
- << " is not consistent with distributed vector shape "
+ << " is not consistent with SIMD/SIMT vector shape "
<< makeString(expectedValueShapeOrFailure.value().getShape())
<< " for tensor descriptor " << tdescTy;
}
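A note on the check above: a result is now accepted if its shape matches either the subgroup-level (SIMD) shape of the tensor descriptor or the per-lane (SIMT) distributed shape. A minimal standalone C++ sketch of that rule (illustrative only, not the MLIR verifier itself):

```cpp
#include <cstdint>
#include <vector>

// Accept a result shape if it matches either the full tensor-descriptor
// shape (SIMD) or the per-work-item distributed shape (SIMT).
static bool isValidResultShape(const std::vector<int64_t> &valueShape,
                               const std::vector<int64_t> &tdescShape,
                               const std::vector<int64_t> &distributedShape) {
  const bool isSIMD = valueShape == tdescShape;
  const bool isSIMT = valueShape == distributedShape;
  return isSIMD || isSIMT;
}
```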
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9531837625878..2e8f91b252ab0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -15,9 +15,16 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
namespace mlir {
@@ -120,6 +127,8 @@ struct SGMap {
const WiLayout &getLayout() const { return wiLayout; }
const WiData &getData() const { return wiData; }
+ ArrayRef<int64_t> getLayoutAsArrayRef() const { return wiLayout.layout; }
+ ArrayRef<int64_t> getDataAsArrayRef() const { return wiData.layout; }
};
void SGMap::print(raw_ostream &os) const {
@@ -223,6 +232,10 @@ static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
return getDefaultSgMap(vectorTy);
}
+static SGMap getSupportedSGMapForOp(Operation *op) {
+ return getDefaultSgMap(2);
+}
+
///===----------------------------------------------------------------------===///
/// SGMapPropagation
///===----------------------------------------------------------------------===///
@@ -634,6 +647,56 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
+static LogicalResult
+attachLayoutAttributes(Operation *top,
+ llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
+ /// Helper to convert SGMap to xegpu::SGMapAttr.
+ auto getSGMapForResult = [&](Value r) -> Attribute {
+ auto layout = getPropagatedLayout(r);
+ if (!layout.isAssigned())
+ return {};
+ SmallVector<uint32_t, 2> wiLayout, wiData;
+ for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+ layout.getDataAsArrayRef())) {
+ wiLayout.push_back(static_cast<uint32_t>(layout));
+ wiData.push_back(static_cast<uint32_t>(data));
+ }
+ return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
+ };
+ /// Attach the layout attributes to the results of the operations.
+ top->walk([&](Operation *op) {
+ /// If no results, skip the operation.
+ if (op->getNumResults() == 0)
+ return;
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+ auto sgMapAttr = getSGMapForResult(op->getResult(0));
+ if (!sgMapAttr)
+ op->emitError("Expecting a layout for the result tensor descriptor.");
+ /// Clone the op, attach the sg_map to the result tensor descriptor, and
+ /// remove the original op.
+ OpBuilder builder(op);
+ auto *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), sgMapAttr);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return;
+ }
+ /// Otherwise simply attach the sg_map to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ auto sgMapAttr = getSGMapForResult(r);
+ if (sgMapAttr) {
+ auto attrName = "r" + std::to_string(i);
+ op->setAttr(attrName, sgMapAttr);
+ }
+ }
+ });
+ return success();
+}
+
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -648,13 +711,14 @@ struct XeGPUSubgroupDistributePass final
} // namespace
void XeGPUSubgroupDistributePass::runOnOperation() {
- Operation *op = getOperation();
- RunSGMapPropagation solver(op);
-
- // Print the analysis result and exit.
+ auto &analyis = getAnalysis<RunSGMapPropagation>();
+ // Print the analysis result and exit. (for testing purposes)
if (printOnly) {
auto &os = llvm::outs();
- solver.printAnalysisResult(os);
+ analyis.printAnalysisResult(os);
return;
}
+ auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
+ if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
+ signalPassFailure();
}
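For reference, a stripped-down sketch (plain C++, illustrative names, not the patch's API) of what the attribute attachment above amounts to: the propagated layout, stored as int64_t arrays, is narrowed to the uint32_t arrays that xegpu::SGMapAttr::get takes, and the i-th result of a non-tensor-descriptor op receives the attribute under the name "r<i>".

```cpp
#include <cstdint>
#include <string>
#include <vector>

// Illustrative stand-in for the propagated SGMap of a value.
struct PropagatedLayout {
  std::vector<int64_t> wiLayout; // e.g. {1, 16}
  std::vector<int64_t> wiData;   // e.g. {1, 1}
};

// Narrow the propagated layout to the element type the attribute expects.
static void toAttrArrays(const PropagatedLayout &l,
                         std::vector<uint32_t> &wiLayout,
                         std::vector<uint32_t> &wiData) {
  for (size_t i = 0; i < l.wiLayout.size(); ++i) {
    wiLayout.push_back(static_cast<uint32_t>(l.wiLayout[i]));
    wiData.push_back(static_cast<uint32_t>(l.wiData[i]));
  }
}

// Attribute name used for the i-th op result, e.g. 0 -> "r0".
static std::string resultAttrName(unsigned i) {
  return "r" + std::to_string(i);
}
```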
>From 20587736992b0eac55454006b67d53225d5c9494 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 21:12:33 +0000
Subject: [PATCH 02/53] moving all ops to region working
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 108 ++++++-------
.../Transforms/XeGPUSubgroupDistribute.cpp | 148 +++++++++++++++++-
2 files changed, 196 insertions(+), 60 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 89b96383699f6..f82084eed6570 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -569,60 +569,60 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//
LogicalResult DpasOp::verify() {
- int64_t lhsRank = getLhsType().getRank();
- int64_t rhsRank = getRhsType().getRank();
- int64_t resultRank = getResultType().getRank();
- auto lhsShape = getLhsType().getShape();
- auto rhsShape = getRhsType().getShape();
- auto resultShape = getResultType().getShape();
-
- auto sgMapA = getSgMapAAttr();
- auto sgMapB = getSgMapBAttr();
- auto sgMapC = getSgMapCAttr();
-
- // If sg_maps are not present, then the operation is in SIMD mode.
- if (!sgMapA && !sgMapB && !sgMapC) {
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
- return emitOpError(
- "expecting lhs and result to be a 2D vector, and rhs to be either "
- "2D or 3D (packed) vector.");
- auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
- if (bK != lhsShape[1])
- return emitOpError("K-dimension mismatch.");
- if (lhsShape[0] != resultShape[0])
- return emitOpError("M-dimension mismatch.");
- if (rhsShape[1] != resultShape[1])
- return emitOpError("N-dimension mismatch.");
- return success();
- }
- // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
- // result of DPAS operation.
- if (!sgMapA || !sgMapB || !sgMapC)
- return emitOpError("sg_map attributes for all operands and outputs are "
- "expected in SIMT xegpu::Dpas operation");
-
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto wiLayoutA = sgMapA.getWiLayout();
- auto wiLayoutB = sgMapB.getWiLayout();
- auto wiLayoutC = sgMapC.getWiLayout();
- // Obtain the expanded shapes of the operands and result using wi_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
- lhsShape[1] * wiLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
- resultShape[1] * wiLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
+ // int64_t lhsRank = getLhsType().getRank();
+ // int64_t rhsRank = getRhsType().getRank();
+ // int64_t resultRank = getResultType().getRank();
+ // auto lhsShape = getLhsType().getShape();
+ // auto rhsShape = getRhsType().getShape();
+ // auto resultShape = getResultType().getShape();
+
+ // auto sgMapA = getSgMapAAttr();
+ // auto sgMapB = getSgMapBAttr();
+ // auto sgMapC = getSgMapCAttr();
+
+ // // If sg_maps are not present, then the operation is in SIMD mode.
+ // if (!sgMapA && !sgMapB && !sgMapC) {
+ // if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ // return emitOpError(
+ // "expecting lhs and result to be a 2D vector, and rhs to be either "
+ // "2D or 3D (packed) vector.");
+ // auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+ // if (bK != lhsShape[1])
+ // return emitOpError("K-dimension mismatch.");
+ // if (lhsShape[0] != resultShape[0])
+ // return emitOpError("M-dimension mismatch.");
+ // if (rhsShape[1] != resultShape[1])
+ // return emitOpError("N-dimension mismatch.");
+ // return success();
+ // }
+ // // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
+ // // result of DPAS operation.
+ // if (!sgMapA || !sgMapB || !sgMapC)
+ // return emitOpError("sg_map attributes for all operands and outputs are "
+ // "expected in SIMT xegpu::Dpas operation");
+
+ // // In SIMT mode, All data fragments must be 2D
+ // if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
+ // return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ // auto wiLayoutA = sgMapA.getWiLayout();
+ // auto wiLayoutB = sgMapB.getWiLayout();
+ // auto wiLayoutC = sgMapC.getWiLayout();
+ // // Obtain the expanded shapes of the operands and result using wi_layout.
+ // // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ // SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ // lhsShape[1] * wiLayoutA[1]};
+ // SmallVector<int64_t> expandedShapeB = {
+ // rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ // SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
+ // resultShape[1] * wiLayoutC[1]};
+ // auto bK = expandedShapeB[0];
+ // if (bK != expandedShapeA[1])
+ // return emitOpError("K-dimension mismatch.");
+ // if (expandedShapeA[0] != expandedShapeC[0])
+ // return emitOpError("M-dimension mismatch.");
+ // if (expandedShapeB[1] != expandedShapeC[1])
+ // return emitOpError("N-dimension mismatch.");
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 2e8f91b252ab0..8ec817693a183 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -17,9 +17,17 @@
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -34,6 +42,9 @@ namespace xegpu {
} // namespace xegpu
} // namespace mlir
+#define DEBUG_TYPE "xegpu-subgroup-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
using namespace mlir;
using namespace mlir::dataflow;
@@ -647,6 +658,27 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
+void attachLayoutAttributeToUsers(Value v, Attribute layout) {
+ for (OpOperand &user : v.getUses()) {
+ Operation *owner = user.getOwner();
+ unsigned operandNumber = user.getOperandNumber();
+ /// If the user is a DpasOp, set "sg_map_a", "sg_map_b", or "sg_map_c"
+ /// attribute.
+ if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
+ if (operandNumber == 0)
+ dpasOp->setAttr("sg_map_a", layout);
+ else if (operandNumber == 1)
+ dpasOp->setAttr("sg_map_b", layout);
+ else if (operandNumber == 2)
+ dpasOp->setAttr("sg_map_c", layout);
+ continue;
+ }
+ /// For every other user, use a generic attribute name.
+ std::string attrName = "op" + std::to_string(operandNumber);
+ owner->setAttr(attrName, layout);
+ }
+}
+
static LogicalResult
attachLayoutAttributes(Operation *top,
llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
@@ -664,15 +696,18 @@ attachLayoutAttributes(Operation *top,
return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
};
/// Attach the layout attributes to the results of the operations.
- top->walk([&](Operation *op) {
- /// If no results, skip the operation.
+ auto walkResult = top->walk([&](Operation *op) {
+ /// If no results, move on.
if (op->getNumResults() == 0)
- return;
+ return WalkResult::advance();
if (auto tensorDescTy =
dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
auto sgMapAttr = getSGMapForResult(op->getResult(0));
- if (!sgMapAttr)
- op->emitError("Expecting a layout for the result tensor descriptor.");
+ if (!sgMapAttr) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return WalkResult::interrupt();
+ }
+
/// Clone the op, attach the sg_map to the result tensor descriptor, and
/// remove the original op.
OpBuilder builder(op);
@@ -683,7 +718,7 @@ attachLayoutAttributes(Operation *top,
newOp->getResult(0).setType(newTensorDescTy);
op->replaceAllUsesWith(newOp->getResults());
op->erase();
- return;
+ return WalkResult::advance();
}
/// Otherwise simply attach the sg_map to the op itself.
for (auto [i, r] : llvm::enumerate(op->getResults())) {
@@ -691,12 +726,104 @@ attachLayoutAttributes(Operation *top,
if (sgMapAttr) {
auto attrName = "r" + std::to_string(i);
op->setAttr(attrName, sgMapAttr);
+ /// Attach the layout attribute to the users of the result.
+ attachLayoutAttributeToUsers(r, sgMapAttr);
}
}
+ return WalkResult::advance();
});
+
+ return failure(walkResult.wasInterrupted());
+}
+
+static LogicalResult resolveLayoutConflicts(Operation *top) {
+ /// TODO: Implement the layout conflict resolution.
return success();
}
+namespace {
+
+struct MoveFuncBodyToWarpExecuteOnLane0
+ : public OpRewritePattern<gpu::GPUFuncOp> {
+ using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFunc,
+ PatternRewriter &rewriter) const override {
+ // if the function contains warp_execute_on_lane0, return
+ if (llvm::any_of(gpuFunc.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ // // if the first op is already warp_execute_on_lane0, return
+ // auto &body = gpuFunc.getBody();
+ // auto &entryBlock = body.front();
+ // if (entryBlock.empty())
+ // return failure();
+ // // llvm::errs() << "entry block: " << entryBlock << "\n";
+ // auto &firstOp = entryBlock.front();
+ // if (isa<gpu::LaneIdOp>(firstOp))
+ // return failure();
+
+ // llvm::errs() << "First op: " << firstOp << "\n";
+
+ // create a new function with the same signature
+ auto newFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFunc.getLoc(), gpuFunc.getName(), gpuFunc.getFunctionType());
+ rewriter.setInsertionPointToEnd(&newFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newFunc.getLoc(), rewriter.getIndexType(),
+ /** upperBound = **/ mlir::IntegerAttr());
+
+ // rewriter.startOpModification(gpuFunc);
+ // rewriter.setInsertionPoint(&firstOp);
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), newFunc->getResultTypes(), laneId, subgroupSize,
+ newFunc.getArguments(), newFunc.getArgumentTypes());
+ auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ auto origRetunOp =
+ cast<gpu::ReturnOp>(gpuFunc.getBlocks().back().getTerminator());
+ rewriter.setInsertionPointAfter(origRetunOp);
+ rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+ origRetunOp.getOperands());
+ // erase return op
+ rewriter.eraseOp(origRetunOp);
+ // auto returnOp =
+ // cast<gpu::ReturnOp>(gpuFunc.getBlocks().end()->getTerminator());
+ // rewriter.startOpModification(returnOp);
+ // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+ // newFunc.getArguments()); rewriter.finalizeOpModification(returnOp);
+ rewriter.inlineRegionBefore(gpuFunc.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ // auto &newWarpBody = warpOp.getBodyRegion();
+ // auto returnOp = cast<gpu::ReturnOp>(newWarpBody.end()->getTerminator());
+ // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+ // returnOp.getOperands());
+ // rewriter.setInsertionPointToEnd(&warpOp.getBodyRegion().front());
+ // add a gpu.yield
+ // rewriter.create<gpu::YieldOp>(warpOp.getLoc(), warpOp.getResults());
+ // rewriter.inlineRegionBefore(gpuFunc.getBody(),
+ // warpOp.getBodyRegion(),
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newFunc.getLoc(), warpOp.getResults());
+ // warpOp.getBodyRegion().begin());
+ // // rewriter.eraseOp(gpuFunc);
+ // // get the function return op
+ // auto returnOp = cast<gpu::ReturnOp>(warpOp.getBody()->getTerminator());
+ // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
+ // returnOp.getOperands());
+ // // rewriter.eraseOp(returnOp);
+ // // create a new function return which retuns the result of the warp
+ // rewriter.setInsertionPointAfter(warpOp);
+ // rewriter.create<gpu::ReturnOp>(warpOp.getLoc(), warpOp.getResults());
+ // rewriter.finalizeOpModification(gpuFunc);
+ rewriter.replaceOp(gpuFunc, newFunc);
+ return success();
+ }
+};
+
+} // namespace
+
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -721,4 +848,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
signalPassFailure();
+ if (failed(resolveLayoutConflicts(getOperation())))
+ signalPassFailure();
+ /// Move all operations inside a GPU functions inside
+ /// gpu.warp_execute_on_lane0
+ {
+ RewritePatternSet patterns(&getContext());
+ patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ }
}
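The operand-side naming convention introduced by attachLayoutAttributeToUsers can be summarized with a small illustrative sketch: DPAS operands A/B/C get dedicated attribute names, every other user operand falls back to a generic "op<n>" name.

```cpp
#include <string>

// Map an operand number to the attribute name used when attaching a layout
// to the user operation (illustrative restatement of the walk above).
static std::string userLayoutAttrName(bool userIsDpas, unsigned operandNumber) {
  if (userIsDpas && operandNumber < 3) {
    static const char *dpasNames[3] = {"sg_map_a", "sg_map_b", "sg_map_c"};
    return dpasNames[operandNumber];
  }
  return "op" + std::to_string(operandNumber);
}
```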
>From 14233fa812ac0f1743547a1cd13ac672b1a64a7e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 21:35:24 +0000
Subject: [PATCH 03/53] moving all ops to region working
---
.../Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8ec817693a183..73256b822db29 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -855,6 +855,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ GreedyRewriteConfig config;
+ config.fold = false;
+ // config.cseConstants = false;
+ // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
}
}
>From f599873a8cbb823b1134993eb8a22591b0a409db Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 20 Mar 2025 22:25:31 +0000
Subject: [PATCH 04/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 84 +++++--------------
1 file changed, 23 insertions(+), 61 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 73256b822db29..82401f542543e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -243,10 +243,6 @@ static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
return getDefaultSgMap(vectorTy);
}
-static SGMap getSupportedSGMapForOp(Operation *op) {
- return getDefaultSgMap(2);
-}
-
///===----------------------------------------------------------------------===///
/// SGMapPropagation
///===----------------------------------------------------------------------===///
@@ -747,77 +743,42 @@ struct MoveFuncBodyToWarpExecuteOnLane0
: public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
- LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFunc,
+ LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
- // if the function contains warp_execute_on_lane0, return
- if (llvm::any_of(gpuFunc.getBody().getOps(), [](Operation &op) {
+ /// If the function all ready moved inside a warp_execute_on_lane0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::WarpExecuteOnLane0Op>(op);
}))
return failure();
- // // if the first op is already warp_execute_on_lane0, return
- // auto &body = gpuFunc.getBody();
- // auto &entryBlock = body.front();
- // if (entryBlock.empty())
- // return failure();
- // // llvm::errs() << "entry block: " << entryBlock << "\n";
- // auto &firstOp = entryBlock.front();
- // if (isa<gpu::LaneIdOp>(firstOp))
- // return failure();
-
- // llvm::errs() << "First op: " << firstOp << "\n";
-
- // create a new function with the same signature
- auto newFunc = rewriter.create<gpu::GPUFuncOp>(
- gpuFunc.getLoc(), gpuFunc.getName(), gpuFunc.getFunctionType());
- rewriter.setInsertionPointToEnd(&newFunc.getFunctionBody().front());
+ /// Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+ /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+ /// original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
auto laneId = rewriter.create<gpu::LaneIdOp>(
- newFunc.getLoc(), rewriter.getIndexType(),
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
/** upperBound = **/ mlir::IntegerAttr());
-
- // rewriter.startOpModification(gpuFunc);
- // rewriter.setInsertionPoint(&firstOp);
+ auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
- laneId.getLoc(), newFunc->getResultTypes(), laneId, subgroupSize,
- newFunc.getArguments(), newFunc.getArgumentTypes());
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ /// Replace the ReturnOp of the original gpu function with a YieldOp.
auto origRetunOp =
- cast<gpu::ReturnOp>(gpuFunc.getBlocks().back().getTerminator());
+ cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
rewriter.setInsertionPointAfter(origRetunOp);
rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
origRetunOp.getOperands());
- // erase return op
rewriter.eraseOp(origRetunOp);
- // auto returnOp =
- // cast<gpu::ReturnOp>(gpuFunc.getBlocks().end()->getTerminator());
- // rewriter.startOpModification(returnOp);
- // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
- // newFunc.getArguments()); rewriter.finalizeOpModification(returnOp);
- rewriter.inlineRegionBefore(gpuFunc.getBody(), warpOp.getBodyRegion(),
+ /// Move the original function body to the warp body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
warpOp.getBodyRegion().begin());
rewriter.eraseBlock(&warpBodyBlock);
- // auto &newWarpBody = warpOp.getBodyRegion();
- // auto returnOp = cast<gpu::ReturnOp>(newWarpBody.end()->getTerminator());
- // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
- // returnOp.getOperands());
- // rewriter.setInsertionPointToEnd(&warpOp.getBodyRegion().front());
- // add a gpu.yield
- // rewriter.create<gpu::YieldOp>(warpOp.getLoc(), warpOp.getResults());
- // rewriter.inlineRegionBefore(gpuFunc.getBody(),
- // warpOp.getBodyRegion(),
+ /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
rewriter.setInsertionPointAfter(warpOp);
- rewriter.create<gpu::ReturnOp>(newFunc.getLoc(), warpOp.getResults());
- // warpOp.getBodyRegion().begin());
- // // rewriter.eraseOp(gpuFunc);
- // // get the function return op
- // auto returnOp = cast<gpu::ReturnOp>(warpOp.getBody()->getTerminator());
- // rewriter.replaceOpWithNewOp<gpu::YieldOp>(returnOp,
- // returnOp.getOperands());
- // // rewriter.eraseOp(returnOp);
- // // create a new function return which retuns the result of the warp
- // rewriter.setInsertionPointAfter(warpOp);
- // rewriter.create<gpu::ReturnOp>(warpOp.getLoc(), warpOp.getResults());
- // rewriter.finalizeOpModification(gpuFunc);
- rewriter.replaceOp(gpuFunc, newFunc);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
return success();
}
};
@@ -855,10 +816,11 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
+    /// region.
GreedyRewriteConfig config;
+ config.cseConstants = false;
config.fold = false;
- // config.cseConstants = false;
- // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
(void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
}
}
>From 220ed1f95c6b55ab0f4fd932fd2d6b51caa04777 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 01:41:57 +0000
Subject: [PATCH 05/53] save work
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 82401f542543e..09e4e2ce2feba 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -745,7 +745,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
PatternRewriter &rewriter) const override {
- /// If the function all ready moved inside a warp_execute_on_lane0, skip.
+ /// If the function already moved inside a warp_execute_on_lane0, skip.
if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
return isa<gpu::WarpExecuteOnLane0Op>(op);
}))
@@ -771,7 +771,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
origRetunOp.getOperands());
rewriter.eraseOp(origRetunOp);
- /// Move the original function body to the warp body.
+ /// Move the original function body to the WarpExecuteOnLane0Op body.
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
warpOp.getBodyRegion().begin());
rewriter.eraseBlock(&warpBodyBlock);
>From 2a8070feae17b213c18b18510d1b09b088f7d274 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 16:04:53 +0000
Subject: [PATCH 06/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 455 ++++++++++++++++--
1 file changed, 417 insertions(+), 38 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 09e4e2ce2feba..db8c321487a1c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -10,6 +10,7 @@
#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -739,52 +740,430 @@ static LogicalResult resolveLayoutConflicts(Operation *top) {
namespace {
+///===----------------------------------------------------------------------===///
+/// SIMT Distribution Patterns
+///===----------------------------------------------------------------------===///
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
+/// contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// ...
+/// ...
+///     gpu.return %result: vector<8x16xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// %laneid = gpu.lane_id : index
+/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.yield %result: vector<8x16xf32>
+/// }
+///     gpu.return %0
+/// }
+/// ```
struct MoveFuncBodyToWarpExecuteOnLane0
: public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
-
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
- PatternRewriter &rewriter) const override {
- /// If the function already moved inside a warp_execute_on_lane0, skip.
- if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
- return isa<gpu::WarpExecuteOnLane0Op>(op);
- }))
- return failure();
- /// Create a new function with the same signature.
- auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
- gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
- /// Create a WarpExecuteOnLane0Op with same arguments and results as the
- /// original gpuFuncOp.
- rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
- auto laneId = rewriter.create<gpu::LaneIdOp>(
- newGpuFunc.getLoc(), rewriter.getIndexType(),
- /** upperBound = **/ mlir::IntegerAttr());
- auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
- auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
- laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
- newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
- auto &warpBodyBlock = warpOp.getBodyRegion().front();
- /// Replace the ReturnOp of the original gpu function with a YieldOp.
- auto origRetunOp =
- cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
- rewriter.setInsertionPointAfter(origRetunOp);
- rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
- origRetunOp.getOperands());
- rewriter.eraseOp(origRetunOp);
- /// Move the original function body to the WarpExecuteOnLane0Op body.
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
- warpOp.getBodyRegion().begin());
- rewriter.eraseBlock(&warpBodyBlock);
- /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
- rewriter.setInsertionPointAfter(warpOp);
- rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
- rewriter.replaceOp(gpuFuncOp, newGpuFunc);
- return success();
- }
+ PatternRewriter &rewriter) const override;
+};
+
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
+/// is a uniform value across all work items within the subgroup.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %arg0, %dead
+/// }
+/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
+/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
+/// case arguments for the store are passed through the warp op interface they
+/// would be propagated as returned values. Only the source vector for the store
+/// is distributed according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32>
+/// }
+/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed according
+/// to the sg_map attribute; the tensor descriptor type is not distributed.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (vector<4x1xf32>) {
+/// ...
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32> ->
+/// vector<4x8xf32>
+/// gpu.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
+/// vector<4x8xf32> gpu.yield %arg0, %arg1
+/// }
+/// %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
+///
+/// ```
+struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
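+/// Distribute a dpas op feeding into vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0`. The A, B and (optional) accumulator operands
+/// are yielded out of the warp op with their distributed types (derived from
+/// the sg_map_a, sg_map_b and sg_map_out attributes), and a new dpas op is
+/// created after the warp op on the distributed fragments.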
+struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
};
} // namespace
+/// Returns the distributed vector type for a source vector type according to
+/// the wi_layout. We simply divide each dimension of the original vector shape
+/// by the corresponding wi_layout dimension. If array_length > 1, it is
+/// appended to the front of the distributed shape.
+/// Examples:
+/// | original vector shape | wi_layout | distributed vector shape |
+/// |-----------------------|-----------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
+ VectorType originalType) {
+ llvm::SmallVector<int64_t, 2> distributedShape;
+ if (!sgMap)
+ return failure();
+
+ auto wiLayout = sgMap.getWiLayout();
+ assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
+ "expecting 2D or 3D shape for the original vector type");
+ assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+ // Original type can be 2D or 3D (array_length > 1), the last two dims are the
+ // block shape.
+ auto blockShape = originalType.getShape().take_back(2);
+ // Check if the block vector shape can be distributed evenly.
+ if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+ return failure();
+
+ if (originalType.getRank() == 3) {
+ distributedShape.push_back(originalType.getShape()[0]);
+ }
+ for (unsigned i = 0; i < 2; ++i) {
+ distributedShape.push_back(blockShape[i] / wiLayout[i]);
+ }
+ auto newVectorType =
+ VectorType::get(distributedShape, originalType.getElementType());
+ return newVectorType;
+}
+
+LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
+ gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
+ /// If the function already moved inside a warp_execute_on_lane0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ /// Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+ /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+ /// original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
+ /** upperBound = **/ mlir::IntegerAttr());
+ auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+ auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ /// Replace the ReturnOp of the original gpu function with a YieldOp.
+ auto origRetunOp =
+ cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+ rewriter.setInsertionPointAfter(origRetunOp);
+ rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+ origRetunOp.getOperands());
+ rewriter.eraseOp(origRetunOp);
+ /// Move the original function body to the WarpExecuteOnLane0Op body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+ return success();
+}
+
+LogicalResult
+SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const {
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+ if (!storeOp)
+ return failure();
+
+ auto tensorDescTy = storeOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+ if (storeOp.getTensorDescType().getShape().size() != 2)
+ return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+
+ auto distributedTypeOrFailure =
+ getDistributedVectorType(sgMap, storeOp.getValueType());
+ if (failed(distributedTypeOrFailure))
+ return rewriter.notifyMatchFailure(storeOp,
+ "Failed to distribute the type");
+ VectorType newVectorType = distributedTypeOrFailure.value();
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp,
+ /* new yielded values = */
+ ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+ /* new yielded types = */
+ TypeRange{storeOp.getTensorDescType(), newVectorType}, newRetIndices);
+
+ // Create a new store op outside the warp op with the distributed vector type.
+ // Tensor descriptor is not distributed.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newStoreOp =
+ cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+ rewriter.eraseOp(storeOp);
+ newStoreOp.getTensorDescMutable().assign(
+ newWarpOp.getResult(newRetIndices[0]));
+ newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+ return success();
+}
+
+LogicalResult
+SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const {
+ OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+ return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+ });
+ if (!operand)
+ return rewriter.notifyMatchFailure(subgroupOp,
+ "warp result is not a xegpu::LoadNd op");
+
+ auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+ xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+ auto tensorDecShape = tensorDescTy.getShape();
+ if (tensorDecShape.size() != 2)
+ return rewriter.notifyMatchFailure(loadOp,
+ "unsupported tensor descriptor shape");
+
+ auto distributedTypeOrFailure =
+ getDistributedVectorType(sgMap, loadOp.getType());
+ if (failed(distributedTypeOrFailure))
+ return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+ VectorType newVectorType = distributedTypeOrFailure.value();
+
+ unsigned operandIdx = operand->getOperandNumber();
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+ /* new yielded types = */ TypeRange{tensorDescTy}, newRetIndices);
+
+ // Create a new load op outside the warp op with the distributed vector type.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+ loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+ loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+ loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+ newLoadOp.getTensorDescMutable().assign(
+ newWarpOp.getResult(newRetIndices[0]));
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+ return success();
+}
+
+LogicalResult
+SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const {
+ OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+ return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+ });
+
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+
+ auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+ if (!srcTypedVal)
+ return rewriter.notifyMatchFailure(
+ descOp, "expecting a memref typed value as the source");
+
+ auto descOffsets = descOp.getMixedOffsets();
+ if (descOffsets.size() != 2)
+ return rewriter.notifyMatchFailure(descOp,
+ "offsets size is expected to be 2");
+
+ xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ descOp, "the tensor descriptor lacks sg_map attribute");
+
+ SmallVector<size_t> newRetIndices;
+ rewriter.setInsertionPoint(subgroupOp);
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, subgroupOp, /* new yielded values = */ descOp.getSource(),
+ /* new yielded types = */ descOp.getSourceType(), newRetIndices);
+
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+ newWarpOp.getLoc(), descOp.getType(),
+ dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
+ descOffsets);
+
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+ return success();
+}
+
+LogicalResult
+SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const {
+ OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
+ return isa<xegpu::DpasOp>(op) && op->hasOneUse();
+ });
+
+ if (!operand)
+ return rewriter.notifyMatchFailure(subgroupOp,
+ "warp result is not a xegpu::Dpas op");
+
+ auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+ xegpu::SGMapAttr sgMapA =
+ mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_a"));
+ xegpu::SGMapAttr sgMapB =
+ mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
+ xegpu::SGMapAttr sgMapResult =
+ mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_out"));
+ if (!sgMapA || !sgMapB || !sgMapResult)
+ return rewriter.notifyMatchFailure(
+ dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
+
+ auto distributedLhsTypeOrFailure =
+ getDistributedVectorType(sgMapA, dpasOp.getLhsType());
+ auto distributedRhsTypeOrFailure =
+ getDistributedVectorType(sgMapB, dpasOp.getRhsType());
+ auto distributedResultTypeOrFailure =
+ getDistributedVectorType(sgMapResult, dpasOp.getResultType());
+ if (failed(distributedLhsTypeOrFailure) ||
+ failed(distributedRhsTypeOrFailure) ||
+ failed(distributedResultTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "Failed to distribute the A, B or result types in xegpu::Dpas op");
+
+ llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
+ llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
+ distributedRhsTypeOrFailure.value()};
+ // Dpas acc operand is optional.
+ if (dpasOp.getAcc()) {
+ newYieldValues.push_back(dpasOp.getAcc());
+ newYieldTypes.push_back(distributedResultTypeOrFailure.value());
+ }
+ // Create a new warp op without the dpas.
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+ // Create a new dpas op outside the warp op.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newDpasOp = cast<xegpu::DpasOp>(*dpasOp.clone());
+ newDpasOp.getLhsMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+ newDpasOp.getRhsMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+ if (dpasOp.getAcc())
+ newDpasOp.getAccMutable().assign(newWarpOp.getResult(newRetIndices[2]));
+ newDpasOp->getOpResult(0).setType(distributedResultTypeOrFailure.value());
+ Value disributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
+
+ return success();
+}
+
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
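The distribution rule documented above for getDistributedVectorType (keep an optional leading array_length dimension, divide the trailing 2-D block shape by wi_layout) can be sketched standalone as follows; this is an illustrative restatement, not the MLIR helper itself.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Per-work-item shape: keep an optional leading array_length dimension and
// divide the trailing 2-D block shape by wi_layout.
static std::vector<int64_t> distributedShape(std::vector<int64_t> shape,
                                             const int64_t wiLayout[2]) {
  assert(shape.size() == 2 || shape.size() == 3);
  const size_t base = shape.size() - 2;
  for (size_t i = 0; i < 2; ++i) {
    assert(shape[base + i] % wiLayout[i] == 0 && "not evenly distributable");
    shape[base + i] /= wiLayout[i];
  }
  return shape; // e.g. {32, 16} with wi_layout {1, 16} -> {32, 1}
}
```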
>From 4838b524a635e566175aa087440283b909555402 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 16:54:15 +0000
Subject: [PATCH 07/53] extend sg_map from subgroup to workgroup
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 76 ++++--
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 63 +++--
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 16 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 126 ++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 157 ++++++++----
mlir/test/Dialect/XeGPU/invalid.mlir | 110 +++++----
mlir/test/Dialect/XeGPU/ops.mlir | 230 ++++++++++--------
7 files changed, 457 insertions(+), 321 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 0136b18ccfa94..7adb9df3c6b25 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,33 +154,81 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_SGMapAttr : XeGPUAttr<"SGMap", "sg_map"> {
+def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
+def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
+def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">; // simt level code
+
+def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumerate of scope",
+ [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_ScopeAttr
+ : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
+ let summary = [{Describe the stage of lowering progress}];
+ let assemblyFormat = "``$value";
+}
+
+def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let summary = [{
Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
}];
let description = [{
- To distribute the XeGPU operation to work items, the tensor_desc must be specified with the sg_map
- attribute at the tensor description creation time.
- Within the `sg_map`, `wi_layout` specifies the layout of work items,
- describing the mapping of work items to the tensor.
- wi_layout[0] x wi_layout[1] must be equal to the total number of work items within a subgroup.
- `wi_data` specifies the minimum number of data elements assigned to each work item for a single distribution.
-
- E.g., #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the subgroup has 16 work items in wi_layout=[1, 16],
- each accessing 1 element as specified by wi_data=[1, 1].
+    XeGPU operations leverage LayoutAttr to distribute data across work-items. It is specified on tensor_descs
+    upon tensor descriptor creation. LayoutAttr contains the following parameters.
+
+    * scope: specifies the scope of the current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
+          It is required for subgroup-level code, but optional for workgroup and wi. By default, if a LayoutAttr
+          contains sg_layout and sg_data, it is treated as workgroup code; if it only contains
+          wi_layout and wi_data, it is considered workitem level.
+ * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
+ * sg_data: [optional] specifies the data size accessed per subgroup.
+    * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
+ The first dimension in the order list is the fastest-changing dimension.
+ * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
+ * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
`wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+
+ E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+ In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+
+ E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+    In this example, the layout represents a workgroup-level work distribution. A workgroup has 8 subgroups organized as a 2x4 layout,
+    and each subgroup accesses a 16x16 block per instruction, which is further distributed to 16 work items as described above.
+
}];
let parameters = (ins
- ArrayRefParameter<"uint32_t">:$wi_layout,
- ArrayRefParameter<"uint32_t">:$wi_data
+ OptionalParameter<"ScopeAttr">: $scope,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
+ OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
+ OptionalParameter<"DenseI32ArrayAttr">: $order,
+ "DenseI32ArrayAttr": $wi_layout,
+ "DenseI32ArrayAttr": $wi_data
);
+ let extraClassDeclaration = [{
+ bool isForWorkgroupLevel() {
+ if (!getScope())
+ return getSgLayout() && getSgData();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+ }
+
+ bool isForSubgroupLevel() {
+ return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+ }
+
+ bool isForWorkItemLevel() {
+ if (!getScope())
+ return !getSgLayout() && !getSgData() && !getOrder();
+ return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+ }
+ }];
- let hasCustomAssemblyFormat = 1;
+ let assemblyFormat = "`<` struct(params) `>`";
let genVerifyDecl = 1;
}
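The scope-defaulting rule in the description above (no explicit scope: sg_layout/sg_data imply workgroup level, only wi_layout/wi_data imply work-item level) amounts to the following plain C++ sketch with illustrative names:

```cpp
#include <optional>

enum class Scope { WG, SG, WI };

// Illustrative restatement of the scope inference in the
// extraClassDeclaration above.
struct LayoutScope {
  std::optional<Scope> scope;
  bool hasSgLayout = false, hasSgData = false, hasOrder = false;

  bool isForWorkgroupLevel() const {
    if (!scope)
      return hasSgLayout && hasSgData;
    return *scope == Scope::WG;
  }
  bool isForWorkItemLevel() const {
    if (!scope)
      return !hasSgLayout && !hasSgData && !hasOrder;
    return *scope == Scope::WI;
  }
};
```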
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 56b836d707a7d..6b27ae3b2754c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -80,7 +80,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.
- In SIMT mode, tensor descriptor is augmented with `SGMapAttr` which describes the
+ In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
mapping of the tensor descriptor to the work items.
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c0 = arith.constant 0 : index
%c1 = arith.constant 8 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -306,7 +306,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
fp32 or fp64. It implies that vnni and transpose cannot exit at the
same time.
- In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result
vector represents the data to be loaded by each work-item.
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
```
@@ -364,7 +364,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
of cache, L1, L2 and L3. If the hardware does not have a corresponding cache,
the corresponding cache hint attribute will be masked.
- In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input
vector represents the data to be stored by each work-item.
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
- #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
Example 2 (SIMT mode):
```
%2 = xegpu.update_nd_offset %1, [0, 16]:
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
@@ -482,7 +482,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
the chunk_size if the chunk size is larger than 1.
In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
- with `SGMapAttr` which describes the mapping of the tensor descriptor to the work items.
+ with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
In this case, the first dimension of the tensor descriptor represents the work-items, and
the second dimension represents the chunk size.
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
```
}];
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
@@ -623,7 +623,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
- In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, result vector
represents the data to be loaded by each work-item. Each work-item receives a `chunk_size`
number of elements.
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
vector<16xi1> -> vector<8x1xf32>
```
@@ -704,7 +704,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
- In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `SGMapAttr`
+ In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
which describes the mapping of the tensor to the work items. In this case, input vector
represents the data to be stored by each work-item. Each work-item receives a `chunk_size`
number of elements.
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
- !xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+ !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
```
}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
- #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
```
}];
@@ -840,9 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.
- In SIMT mode, DpasOp expects attributes `sg_map_a`, `sg_map_b`, and `sg_map_c`
- which descibes the data fragment owned by each work-item w.r.t. the tensor
- descriptor these data are loaded from.
+ In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (the latter only if `acc` is used),
+ which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
+ the data is loaded from.
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -853,9 +853,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
XeGPU_DpasOpType : $lhs,
XeGPU_DpasOpType : $rhs,
Optional<XeGPU_Vector2DType>: $acc,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_a,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_b,
- OptionalAttr<XeGPU_SGMapAttr>:$sg_map_c);
+ OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
+ OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
let results = (outs XeGPU_Vector2DType: $result);
let extraClassDeclaration = [{
@@ -876,6 +876,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
VectorType getResultType() {
return getResult().getType();
}
+
+ bool hasAcc() {
+ return getAcc() != nullptr;
+ }
}];
let assemblyFormat = [{
@@ -979,4 +983,21 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
+def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
+ let summary = "Convert the sg layout of the input operand";
+ let description = [{
+ convert_layout remaps the distribution of data across work items by updating the LayoutAttr.
+ }];
+ let arguments = (ins XeGPU_Vector2DType: $source,
+ XeGPU_LayoutAttr: $srcMap,
+ XeGPU_LayoutAttr: $resMap
+ );
+ let results = (outs XeGPU_Vector2DType: $result);
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source)
+ }];
+
+ let hasVerifier = 1;
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
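
To make the SIMT `load_nd`/`store_nd` examples above concrete, here is a small standalone C++ sketch of which elements of an 8x16 tile a single lane ends up holding under wi_layout = [1, 16], wi_data = [1, 1]; the row-major indexing convention is an assumption for illustration and is not taken from the XeGPU lowering itself:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int rows = 8, cols = 16;
  const int wiLayout[2] = {1, 16}; // 16 lanes laid out along the columns
  std::vector<float> tile(rows * cols);
  for (int i = 0; i < rows * cols; ++i)
    tile[i] = static_cast<float>(i);

  int lane = 5; // any lane id in [0, 16)
  // Each lane owns rows / wiLayout[0] x cols / wiLayout[1] = 8x1 elements,
  // i.e. the vector<8x1xf32> result type used in the examples above.
  for (int r = 0; r < rows / wiLayout[0]; ++r) {
    int c = lane; // with wi_data = [1, 1] the lane reads a single column
    std::printf("lane %d: fragment[%d][0] = tile[%d][%d] = %.0f\n", lane, r, r,
                c, tile[r * cols + c]);
  }
}
```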
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index ccd91a928e1dd..c92ea42efde3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, sg_map `<` wi_layout = value, wi_data = value `>`)?
+ attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
```
Examples:
@@ -78,15 +78,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
- // A TensorDesc with a sg_map
- xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // A TensorDesc with a layout
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
"mlir::Type": $elementType,
OptionalParameter<"mlir::Attribute">: $encoding,
- OptionalParameter<"mlir::Attribute">: $sg_map);
+ OptionalParameter<"mlir::Attribute">: $layout);
let builders = [
TypeBuilderWithInferredContext<(ins
@@ -95,13 +95,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"int", "1">: $array_length,
CArg<"bool", "true">: $boundary_check,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>,
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>,
TypeBuilderWithInferredContext<(ins
"llvm::ArrayRef<int64_t>": $shape,
"mlir::Type": $elementType,
CArg<"int", "1">: $chunk_size,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
- CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
+ CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>
];
let extraClassDeclaration = [{
@@ -127,8 +127,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
}
- SGMapAttr getSGMapAttr() const {
- return llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+ LayoutAttr getLayoutAttr() const {
+ return llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
}
xegpu::MemorySpace getMemorySpace() const {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 78c242571935c..52b9f2c192b3f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -68,73 +68,39 @@ LogicalResult ScatterTensorDescAttr::verify(
}
//===----------------------------------------------------------------------===//
-// XeGPU_SGMapAttr
+// XeGPU_LayoutAttr
//===----------------------------------------------------------------------===//
-namespace {
-template <typename T, unsigned N>
-LogicalResult parseIntArrayField(::mlir::AsmParser &parser,
- llvm::SmallVector<T, N> &result,
- llvm::StringRef fieldName) {
- if (failed(parser.parseKeyword(fieldName))) {
- parser.emitError(parser.getCurrentLocation(),
- "unexpected field name. Expected " + fieldName + ".");
- return failure();
+LogicalResult
+LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
+ ScopeAttr scope,
+ DenseI32ArrayAttr sg_layout,
+ DenseI32ArrayAttr sg_data,
+ DenseI32ArrayAttr order,
+ DenseI32ArrayAttr wi_layout,
+ DenseI32ArrayAttr wi_data) {
+
+ if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
+ return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
}
- if (failed(parser.parseEqual())) {
- parser.emitError(parser.getCurrentLocation(), "expected '=' sign.");
- return failure();
+ if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
+ return emitError() << "expected sg_layout and sg_data being both present or both absent";
}
- auto elemParser = [&]() -> llvm::ParseResult {
- uint32_t elem = 0;
- auto res = parser.parseInteger(elem);
- result.push_back(elem);
- return res;
- };
-
- return parser.parseCommaSeparatedList(AsmParser::Delimiter::Square,
- elemParser, fieldName);
-}
-} // namespace
-
-mlir::Attribute SGMapAttr::parse(::mlir::AsmParser &parser,
- ::mlir::Type attrType) {
- if (failed(parser.parseLess()))
- return {};
-
- llvm::SmallVector<uint32_t, 2> wi_layout, wi_data;
- if (failed(parseIntArrayField(parser, wi_layout, "wi_layout")))
- return {};
-
- if (failed(parser.parseComma()))
- return {};
-
- if (failed(parseIntArrayField(parser, wi_data, "wi_data")))
- return {};
+ if (order) {
+ if (!sg_layout)
+ return emitError() << "expected order being used with sg_layout and sg_data.";
+ if (order.size() != sg_layout.size())
+ return emitError() << "expected order having the same rank as sg_layout and sg_data";
+ }
- return SGMapAttr::getChecked(
- [&]() { return parser.emitError(parser.getNameLoc()); },
- parser.getContext(), wi_layout, wi_data);
-}
+ if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+ return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+ }
-void SGMapAttr::print(::mlir::AsmPrinter &printer) const {
- printer << "<";
- printer.printKeywordOrString("wi_layout");
- printer << " = [" << getWiLayout() << "], ";
- printer.printKeywordOrString("wi_data");
- printer << " = [" << getWiData() << "]";
- printer << ">";
-}
+ if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+ return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
-LogicalResult
-SGMapAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- llvm::ArrayRef<uint32_t> wi_layout,
- llvm::ArrayRef<uint32_t> wi_data) {
- if (wi_layout.size() != 2)
- return emitError() << "expected wi_layout of size 2";
- if (wi_data.size() != 2)
- return emitError() << "expected wi_data of size 2";
return success();
}
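
For readability, the constraints enforced by this verifier can be restated as a plain C++ predicate. This is a simplified sketch only (scope reduced to a bare enum, the attributes to vectors), not the MLIR implementation:

```cpp
#include <optional>
#include <vector>

enum class Scope { WG, SG, WI };

struct Layout {
  std::optional<Scope> scope;
  std::vector<int> sgLayout, sgData, order; // empty means "not set"
  std::vector<int> wiLayout, wiData;
};

bool isValidLayout(const Layout &l) {
  bool hasSg = !l.sgLayout.empty(), hasSgData = !l.sgData.empty();
  // sg_layout, sg_data and order are only meaningful at workgroup scope.
  if (l.scope && *l.scope != Scope::WG &&
      (hasSg || hasSgData || !l.order.empty()))
    return false;
  // sg_layout and sg_data must be both present or both absent.
  if (hasSg != hasSgData)
    return false;
  // order requires sg_layout/sg_data and must have the same rank.
  if (!l.order.empty() && (!hasSg || l.order.size() != l.sgLayout.size()))
    return false;
  // Ranks must agree and must not exceed 2.
  if (hasSg && (l.sgLayout.size() != l.sgData.size() || l.sgLayout.size() > 2))
    return false;
  return l.wiLayout.size() == l.wiData.size() && l.wiLayout.size() <= 2;
}
```

When the scope is unset, the presence or absence of the sg_* fields decides the level, matching the isForWorkgroupLevel/isForWorkItemLevel helpers added to the attribute.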
@@ -146,7 +112,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
llvm::SmallVector<int64_t> shape;
mlir::Type elementType;
mlir::FailureOr<mlir::Attribute> encoding;
- mlir::FailureOr<mlir::Attribute> sg_map;
+ mlir::FailureOr<mlir::Attribute> layout;
// Parse literal '<'
if (parser.parseLess())
@@ -169,8 +135,8 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
mlir::Attribute attr;
ParseResult res = parser.parseAttribute(attr);
if (mlir::succeeded(res)) {
- if (mlir::isa<SGMapAttr>(attr)) {
- sg_map = attr;
+ if (mlir::isa<LayoutAttr>(attr)) {
+ layout = attr;
continue;
}
if (mlir::isa<BlockTensorDescAttr, ScatterTensorDescAttr>(attr)) {
@@ -188,7 +154,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
return TensorDescType::getChecked(
[&]() { return parser.emitError(parser.getNameLoc()); },
parser.getContext(), shape, elementType,
- encoding.value_or(mlir::Attribute()), sg_map.value_or(mlir::Attribute()));
+ encoding.value_or(mlir::Attribute()), layout.value_or(mlir::Attribute()));
}
void TensorDescType::print(::mlir::AsmPrinter &printer) const {
@@ -208,8 +174,8 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const {
if (auto encoding = getEncoding())
printer << ", " << encoding;
- if (auto sg_map = getSgMap())
- printer << ", " << sg_map;
+ if (auto layout = getLayout())
+ printer << ", " << layout;
printer << ">";
}
@@ -218,29 +184,29 @@ TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int array_length,
bool boundary_check,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = BlockTensorDescAttr::get(context, memory_space, array_length,
boundary_check);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
TensorDescType TensorDescType::get(llvm::ArrayRef<int64_t> shape,
mlir::Type elementType, int chunk_size,
MemorySpace memory_space,
- mlir::Attribute sg_map) {
+ mlir::Attribute layout) {
auto context = elementType.getContext();
auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size);
- return Base::get(context, shape, elementType, attr, sg_map);
+ return Base::get(context, shape, elementType, attr, layout);
}
LogicalResult TensorDescType::verify(
llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
- mlir::Attribute encoding, mlir::Attribute sg_map) {
+ mlir::Attribute encoding, mlir::Attribute layout) {
size_t rank = shape.size();
// Low-pressure types are packed in 32-bit units.
- unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
+ int32_t packingFactor = 32 / elementType.getIntOrFloatBitWidth();
if (rank != 1 && rank != 2)
return emitError() << "expected 1D or 2D tensor";
@@ -274,9 +240,9 @@ LogicalResult TensorDescType::verify(
return emitError() << "SLM is not supported for 2D block tensor";
}
- if (auto sgMapAttr = llvm::dyn_cast_if_present<SGMapAttr>(sg_map)) {
- ArrayRef<uint32_t> wiLayout = sgMapAttr.getWiLayout();
- ArrayRef<uint32_t> wiData = sgMapAttr.getWiData();
+ if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+ ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
+ ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
if (rank == 1) {
if (wiLayout[0] != 1 || wiData[0] != 1)
@@ -318,7 +284,7 @@ LogicalResult TensorDescType::verify(
return success();
}
-// If tensor descriptor has a sg_map attribute it is used in SIMT mode.
+// If tensor descriptor has a layout attribute it is used in SIMT mode.
// In this mode, the distributed vector shape is determined as follows:
// Definitions:
// wi_data_size = wi_data[0] × wi_data[1]
@@ -343,13 +309,13 @@ LogicalResult TensorDescType::verify(
// Distributed vector shape must be:
// [n_distribution_units, wi_data_size]
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
- auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
- // If no sg_map is provided, tensor desc is not used in SIMT mode.
- if (!sgMap)
+ auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
+ // If no layout is provided, tensor desc is not used in SIMT mode.
+ if (!layout || !layout.isForWorkItemLevel())
return failure();
- SmallVector<int64_t> wiData(sgMap.getWiData());
- SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+ SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
+ SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
auto tdescShape = getShape();
auto wiDataSize = 1, sgSize = 1;
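
The distribution rule summarized in the comment block above can be exercised with a standalone sketch. The formula `n_distribution_units = tensor_size / (subgroup_size * wi_data_size)` is inferred from the shapes used in the examples and tests (e.g. 8x16xf32 with wi_layout = [1, 16], wi_data = [1, 1] distributing to vector<8x1xf32>), so treat it as an assumption rather than a restatement of the implementation:

```cpp
#include <array>
#include <cstdint>
#include <iostream>

// Distributed (SIMT) vector shape for a 2D tensor descriptor.
std::array<int64_t, 2> distributedShape(std::array<int64_t, 2> tdescShape,
                                        std::array<int64_t, 2> wiLayout,
                                        std::array<int64_t, 2> wiData) {
  int64_t wiDataSize = wiData[0] * wiData[1]; // wi_data[0] x wi_data[1]
  int64_t sgSize = wiLayout[0] * wiLayout[1]; // work items per subgroup
  int64_t tensorSize = tdescShape[0] * tdescShape[1];
  return {tensorSize / (sgSize * wiDataSize), wiDataSize};
}

int main() {
  auto a = distributedShape({8, 16}, {1, 16}, {1, 1});  // -> [8, 1]
  auto b = distributedShape({16, 16}, {1, 16}, {2, 1}); // -> [8, 2]
  std::cout << a[0] << "x" << a[1] << ", " << b[0] << "x" << b[1] << "\n";
}
```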
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3bdf3fb218b45..c7e863256f235 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -78,18 +78,18 @@ static LogicalResult
isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
ArrayRef<int64_t> adjustedTdescShape,
function_ref<InFlightDiagnostic()> emitError) {
- auto sgMap = tdescTy.getSGMapAttr();
+ auto layout = tdescTy.getLayoutAttr();
auto valueShape = valueTy.getShape();
- // sg_map not present means IR is in SIMD mode. In this case value shape must
+ // layout not present means IR is in SIMD mode. In this case value shape must
// match adjusted tensor descriptor shape.
- if (!sgMap)
+ if (!layout || !layout.isForWorkItemLevel())
return valueShape == adjustedTdescShape
? success()
: emitError()
<< "Value shape " << makeString(valueShape)
<< " is not consistent with tensor descriptor " << tdescTy;
- // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+ // layout present means IR is in SIMT mode. In this case layout determines the
// value shape.
auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
assert(succeeded(expectedValueShapeOrFailure) &&
@@ -105,6 +105,25 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
<< " for tensor descriptor " << tdescTy;
}
+static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
+ xegpu::LayoutAttr attr) {
+ assert(attr && "workgroup map attribute is missing.");
+ llvm::ArrayRef<int32_t> layout, data;
+ if (attr.getSgLayout()) {
+ data = attr.getSgData().asArrayRef();
+ layout = attr.getSgLayout().asArrayRef();
+ } else {
+ data = attr.getWiData().asArrayRef();
+ layout = attr.getWiLayout().asArrayRef();
+ }
+ for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
+ // fail if s is not evenly divisible by d * l, i.e. s % d != 0 or (s / d) % l != 0
+ if (s % d != 0 || (s / d) % l != 0)
+ return false;
+ }
+ return true;
+}
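
The two modulo checks above are equivalent to requiring every dimension of the shape to be a multiple of `data[d] * layout[d]`; a minimal standalone restatement:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

bool isEvenlyDistributedSketch(const std::vector<int64_t> &shape,
                               const std::vector<int64_t> &data,
                               const std::vector<int64_t> &layout) {
  for (std::size_t d = 0; d < shape.size(); ++d)
    // Same condition as s % d != 0 || (s / d) % l != 0 above.
    if (shape[d] % (data[d] * layout[d]) != 0)
      return false;
  return true;
}
```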
+
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -541,7 +560,7 @@ LogicalResult StoreScatterOp::verify() {
[&]() { return emitOpError(); });
}
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
@@ -569,61 +588,107 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
LogicalResult DpasOp::verify() {
int64_t lhsRank = getLhsType().getRank();
int64_t rhsRank = getRhsType().getRank();
- int64_t resultRank = getResultType().getRank();
+ int64_t resRank = getResultType().getRank();
auto lhsShape = getLhsType().getShape();
auto rhsShape = getRhsType().getShape();
- auto resultShape = getResultType().getShape();
-
- auto sgMapA = getSgMapAAttr();
- auto sgMapB = getSgMapBAttr();
- auto sgMapC = getSgMapCAttr();
+ auto resShape = getResultType().getShape();
+
+ auto layoutA = getALayoutAttr();
+ auto layoutB = getBLayoutAttr();
+ auto layoutC = getCLayoutAttr();
+
+ // make sure the layout attribute is either set for every available
+ // operand or simply not set at all. C is special, since ACC is optional.
+ // If they are all set, they also should be in the same scope.
+ auto isValidSet = [&]() {
+ bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+ if (hasAcc()) {
+ result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+ }
+ result = !result;
- // If sg_maps are not present, then the operation is in SIMD mode.
- if (!sgMapA && !sgMapB && !sgMapC) {
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ if (layoutA) {
+ auto scope = layoutA.getScope();
+ result &= layoutB ? scope == layoutB.getScope() : false;
+ if (hasAcc())
+ result &= layoutC ? scope == layoutC.getScope() : false;
+ }
+ return result;
+ };
+
+ if (!isValidSet())
+ return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+
+ // query the scope from layoutA (a valid setting).
+ if (layoutA && layoutA.isForWorkItemLevel()) {
+ // In SIMT mode, all data fragments must be 2D
+ if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+ return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ auto wiLayoutA = layoutA.getWiLayout();
+ auto wiLayoutB = layoutB.getWiLayout();
+ auto wiLayoutC = layoutC.getWiLayout();
+ // Obtain the expanded shapes of the operands and result using wi_layout.
+ // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeB = {
+ rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
+ resShape[1] * wiLayoutC[1]};
+ auto bK = expandedShapeB[0];
+ if (bK != expandedShapeA[1])
+ return emitOpError("K-dimension mismatch.");
+ if (expandedShapeA[0] != expandedShapeC[0])
+ return emitOpError("M-dimension mismatch.");
+ if (expandedShapeB[1] != expandedShapeC[1])
+ return emitOpError("N-dimension mismatch.");
+ } else { // For other scopes, operand shapes should match the MxKxN semantics.
+ if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
"2D or 3D (packed) vector.");
auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
if (bK != lhsShape[1])
return emitOpError("K-dimension mismatch.");
- if (lhsShape[0] != resultShape[0])
+ if (lhsShape[0] != resShape[0])
return emitOpError("M-dimension mismatch.");
- if (rhsShape[1] != resultShape[1])
+ if (rhsShape[1] != resShape[1])
return emitOpError("N-dimension mismatch.");
- return success();
}
- // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
- // result of DPAS operation.
- if (!sgMapA || !sgMapB || !sgMapC)
- return emitOpError("sg_map attributes for all operands and outputs are "
- "expected in SIMT xegpu::Dpas operation");
-
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto wiLayoutA = sgMapA.getWiLayout();
- auto wiLayoutB = sgMapB.getWiLayout();
- auto wiLayoutC = sgMapC.getWiLayout();
- // Obtain the expanded shapes of the operands and result using wi_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
- lhsShape[1] * wiLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
- resultShape[1] * wiLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
-
return success();
}
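
As a cross-check of the SIMT branch above, the following standalone sketch expands the per-lane fragment shapes by wi_layout and applies the M/K/N compatibility test, using the 8x1 / 8x2 / 8x1 fragment shapes that appear in the SIMT tests (illustrative values only):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  std::array<int64_t, 2> lhs = {8, 1}, rhs = {8, 2}, res = {8, 1};
  std::array<int64_t, 2> wiA = {1, 16}, wiB = {1, 16}, wiC = {1, 16};

  // Expanded (subgroup-level) shapes; for B the packed dimension is folded in.
  int64_t expA[2] = {lhs[0] * wiA[0], lhs[1] * wiA[1]};     // 8 x 16
  int64_t expB[2] = {rhs[0] * rhs[1] * wiB[0], 1 * wiB[1]}; // 16 x 16
  int64_t expC[2] = {res[0] * wiC[0], res[1] * wiC[1]};     // 8 x 16

  bool ok = expB[0] == expA[1] && // K-dimension
            expA[0] == expC[0] && // M-dimension
            expB[1] == expC[1];   // N-dimension
  std::printf("dpas fragment shapes %s\n", ok ? "compatible" : "mismatch");
}
```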
+
+//===----------------------------------------------------------------------===//
+// XeGPU_ConvertLayoutOp
+//===----------------------------------------------------------------------===//
+LogicalResult ConvertLayoutOp::verify() {
+ auto srcMap = getSrcMapAttr();
+ auto resMap = getResMapAttr();
+ if (!srcMap)
+ return emitOpError("expected srcMap.");
+ if (!resMap)
+ return emitOpError("expected resMap.");
+
+ if (srcMap.getScope() != resMap.getScope())
+ return emitOpError("expected srcMap and resMap be in the same scope.");
+
+ if (srcMap == resMap)
+ return emitOpError("expected different srcMap and resMap.");
+
+ if (srcMap.isForWorkItemLevel())
+ return emitOpError("doesn't work on SIMT code.");
+
+ auto shape = getSource().getType().getShape();
+ if (!isEvenDistributed(shape, srcMap))
+ return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+
+ if (!isEvenDistributed(shape, resMap))
+ return emitOpError("invalid resMap, data cannot be evenly distributed.");
+
+ return mlir::success();
+}
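
The legality conditions checked here (both maps present and distinct, same scope, not work-item scope, and even distribution of the source shape under both maps) can be summarized as a self-contained sketch; SimpleLayout is an illustrative stand-in for the real attribute, and the srcMap != resMap requirement is elided:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

enum class LayoutScope { WG, SG, WI };

struct SimpleLayout {
  LayoutScope scope;
  std::vector<int64_t> data;   // sg_data if present, otherwise wi_data
  std::vector<int64_t> layout; // sg_layout if present, otherwise wi_layout
};

bool isLegalConvertLayout(const std::vector<int64_t> &shape,
                          const SimpleLayout &src, const SimpleLayout &res) {
  // Both maps must live in the same scope, and SIMT (work-item) code is
  // rejected, mirroring the verifier above.
  if (src.scope != res.scope || src.scope == LayoutScope::WI)
    return false;
  auto even = [&](const SimpleLayout &l) {
    for (std::size_t d = 0; d < shape.size(); ++d)
      if (shape[d] % (l.data[d] * l.layout[d]) != 0)
        return false;
    return true;
  };
  return even(src) && even(res);
}
```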
+
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 88e9bbf78945b..c4958d920a89f 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -78,25 +78,25 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8x2xf32>
return
}
// -----
-func.func @test_load_nd_sg_map(%src: memref<24x32xf32>) {
+func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
-> vector<8xf32>
return
}
@@ -134,22 +134,22 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_store_nd_sg_map(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
+func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -245,69 +245,69 @@ func.func @test_prefetch_vc_2(%src: ui64) {
}
// -----
-func.func @test_create_tdesc_sg_map_1(%src: ui64) {
+func.func @test_create_tdesc_layout_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_2(%src: ui64) {
+func.func @test_create_tdesc_layout_2(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [2, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
return
}
// -----
-func.func @test_create_tdesc_sg_map_3(%src: ui64) {
+func.func @test_create_tdesc_layout_3(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
return
}
// -----
-func.func @test_load_gather_sg_map_1(%src: ui64) {
+func.func @test_load_gather_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
// -----
-func.func @test_load_gather_sg_map_2(%src: ui64) {
+func.func @test_load_gather_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
// -----
-func.func @test_store_scatter_sg_map_1(%src: ui64) {
+func.func @test_store_scatter_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
// -----
-func.func @test_store_scatter_sg_map_2(%src: ui64) {
+func.func @test_store_scatter_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
return
}
@@ -394,18 +394,18 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
}
// -----
-func.func @test_dpas_sg_map_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // expected-error at +1 {{sg_map attributes for all operands and outputs are expected in SIMT xegpu::Dpas operation}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
+ // expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
return
}
// -----
-func.func @test_dpas_sg_map_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
+func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
// expected-error at +1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
return
}
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [2, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
return
}
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
return
}
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 8 over 16 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
return
}
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
return
}
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 2 work items with 4 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [4, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
return
}
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map<wi_layout = [8, 2], wi_data = [1, 2]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
return
}
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
!xegpu.tensor_desc<4x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 1], wi_data = [2, 1]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
return
}
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
return
}
@@ -520,6 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected chunk blocks for 2D tensor}}
!xegpu.tensor_desc<16x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.sg_map<wi_layout = [8, 1], wi_data = [1, 2]>>
+ #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
return
}
+
+// -----
+func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+ // expected-error at +1 {{expected different srcMap and resMap}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+// -----
+func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+ // expected-error at +1 {{expected srcMap and resMap be in the same scope}}
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index c32f1905454b6..6a29a73a20612 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
gpu.return
}
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
gpu.return
}
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
gpu.return
}
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
gpu.return
}
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
gpu.return
}
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.sg_map<wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
%1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
%1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
gpu.return
}
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
gpu.func @test_create_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
gpu.return
}
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
gpu.func @test_create_tdesc_simt_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
gpu.return
}
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
gpu.return
}
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
gpu.return
}
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
%2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
%2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
gpu.return
}
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map<wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.func @test_prefetch_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
gpu.return
}
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
%s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.sg_map<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
gpu.return
}
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- // CHECK: sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- // CHECK: sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {sg_map_a = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>,
- sg_map_b = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [2, 1]>,
- sg_map_c = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>}
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
: vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
gpu.return
}
@@ -704,4 +704,24 @@ gpu.func @fence() {
gpu.return
}
+// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ gpu.return
+}
+
+gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
+ resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
+ resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ gpu.return
+}
+
+
}
>From cb2697927bc75b00abd03a39ffb0698ba8b9e0a4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:14:59 +0000
Subject: [PATCH 08/53] format code
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 35 ++++++++++++----------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 7 +++--
2 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 52b9f2c192b3f..5e21bb805a6a5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -72,34 +72,39 @@ LogicalResult ScatterTensorDescAttr::verify(
//===----------------------------------------------------------------------===//
LogicalResult
LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- ScopeAttr scope,
- DenseI32ArrayAttr sg_layout,
- DenseI32ArrayAttr sg_data,
- DenseI32ArrayAttr order,
- DenseI32ArrayAttr wi_layout,
- DenseI32ArrayAttr wi_data) {
-
- if (scope && scope.getValue() != Scope::WG && (sg_layout || sg_data || order)) {
- return emitError() << "expected sg_layout, sg_data, and order being only used at workgroup level.";
+ ScopeAttr scope, DenseI32ArrayAttr sg_layout,
+ DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
+ DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
+
+ if (scope && scope.getValue() != Scope::WG &&
+ (sg_layout || sg_data || order)) {
+ return emitError() << "expected sg_layout, sg_data, and order being only "
+ "used at workgroup level.";
}
if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
- return emitError() << "expected sg_layout and sg_data being both present or both absent";
+ return emitError() << "expected sg_layout and sg_data being both present "
+ "or both absent";
}
if (order) {
if (!sg_layout)
- return emitError() << "expected order being used with sg_layout and sg_data.";
+ return emitError()
+ << "expected order being used with sg_layout and sg_data.";
if (order.size() != sg_layout.size())
- return emitError() << "expected order having the same rank as sg_layout and sg_data";
+ return emitError()
+ << "expected order having the same rank as sg_layout and sg_data";
}
- if (sg_layout && (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
- return emitError() << "expected sg_layout and sg_data having the same rank, which is not larger than 2";
+ if (sg_layout &&
+ (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
+ return emitError() << "expected sg_layout and sg_data having the same "
+ "rank, which is not larger than 2";
}
if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
- return emitError() << "expected wi_layout and wi_data having the same rank, which is not larger than 2";
+ return emitError() << "expected wi_layout and wi_data having the same "
+ "rank, which is not larger than 2";
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index c7e863256f235..66b5054278c8c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -617,7 +617,9 @@ LogicalResult DpasOp::verify() {
};
if (!isValidSet())
- return emitOpError("layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code).");
+ return emitOpError(
+ "layout attributes should be either set for all operands (for SIMT "
+ "code) or not set at all (for SIMD code).");
// query the scope from layoutA (a valid setting).
if (layoutA && layoutA.isForWorkItemLevel()) {
@@ -643,7 +645,8 @@ LogicalResult DpasOp::verify() {
return emitOpError("M-dimension mismatch.");
if (expandedShapeB[1] != expandedShapeC[1])
return emitOpError("N-dimension mismatch.");
- } else { // For other scopes, operands' shape should match the mxkxn semantics.
+ } else { // For other scopes, operands' shape should match the mxkxn
+ // semantics.
if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
>From 273fc408a1c63fe3d4100708cad190f01b6d2523 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 17:19:07 +0000
Subject: [PATCH 09/53] remove changes to prefetch op
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6b27ae3b2754c..a3ee6e901a775 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -571,7 +571,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
let hasVerifier = 1;
}
-def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [MemoryEffects<[MemRead]>]> {
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let summary = "prefetches a set of scattered data points to cache";
let description = [{
>From 504d2748efb1ad3d29a3187a5e692d58247a3bdd Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 18:06:52 +0000
Subject: [PATCH 10/53] refine the doc for TensorDesc
---
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 43 +++++++++++--------
1 file changed, 24 insertions(+), 19 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index c92ea42efde3b..82d6a4ec39e6b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,27 +34,24 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
[ShapedTypeInterface], "::mlir::TensorType"> {
let summary = "TensorDesc describing regions of interested data.";
let description = [{
- TensorDesc is a type designed to describe regions of the interested data as well as some
- features that are unique to Intel hardware. Different with the builtin tensor type in MLIR,
- it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
- to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
- It encodes the following information:
+ TensorDesc is a type designed to describe regions of interest in data, as well as some features
+ unique to Intel hardware. Unlike the built-in tensor type in MLIR, it essentially contains only
+ metadata and does not hold the data itself. It is primarily designed to support 2D block load/store
+ and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
   * shape: the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
         and each row contains 16 contiguous data elements. The rows could be
- either contiguous or not, depends on whether the encoding attribute
- is set or not.
- * element_type: the data type of the data element, e.g., f16, f32.
+          either contiguous or not, depending on the encoding attribute. If the
+          encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
+          is a ScatterTensorDescAttr, rows need not be contiguous. If the
+          encoding is not set, it defaults to BlockTensorDescAttr.
- Similar to the builtin tensor, it also provides an optinal attribute to encoding
- the following information via the TensorDescAttr object:
- * memory_space (xegpu::MemorySpace): [optional] where the data is located,
- global memory or shared memory. It is default to Global.
- * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
- that will be loaded by block load at a time. It is default to 1.
- * boundary_check (bool): [optional] indicates whether the operation detects the boundary
- and pads with zero for out-of-boundary access. It is default to do boundary check.
+ * element_type: the data type of the data element, e.g., f16, f32.
+    Similar to the built-in tensor, it also provides optional attributes to encode
+    additional information via either BlockTensorDescAttr or ScatterTensorDescAttr, and
+    to support workgroup, subgroup, and work-item (SIMT) level programming via the
+    Layout attribute. Please check their definitions for details.
Syntax:
@@ -63,7 +60,9 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
element-type ::= float-type | integer-type | index-type
dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal
- attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? (, layout `<` wi_layout = value, wi_data = value `>`)?
+ attr-list = (, encoding-attr)? (, layout-attr)?
+    encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
+ layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? wi_layout = value, wi_data = value `>`)?
```
Examples:
@@ -78,8 +77,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
- // A TensorDesc with a layout
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+ // A TensorDesc with a layout for workgroup level programming
+ xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
+
+ // A TensorDesc with a layout for subgroup level programming
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>>
+
+ // A TensorDesc with a layout for workitem level programming
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
```
}];
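As a small usage sketch consistent with the refined documentation above (and with the load_nd tests earlier in this PR; illustrative only, not part of the patch), a work-item scoped descriptor yields a per-lane fragment when loaded:

  %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
       -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
  // each of the 16 lanes receives an 8x1 slice of the 8x16 block
  %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
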
>From 90e070493d85af6fbdd31bb78b0e12b2a726ce49 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 19:41:57 +0000
Subject: [PATCH 11/53] save work
---
.../Dialect/XeGPU/Transforms/Transforms.h | 2 +
.../Transforms/XeGPUSubgroupDistribute.cpp | 57 ++++++++++++-------
2 files changed, 39 insertions(+), 20 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df06937..3e94021c7a1ea 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,8 @@ namespace xegpu {
/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU SIMT distribution into `patterns`.
+void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index db8c321487a1c..925ba88a7a1db 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -753,7 +753,7 @@ namespace {
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// ...
/// ...
-/// gpu.yield %result: vector<8x16xf32>
+/// gpu.return %result: vector<8x16xf32>
/// }
/// ```
/// To
@@ -1075,9 +1075,6 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
descOp, "expecting a memref typed value as the source");
auto descOffsets = descOp.getMixedOffsets();
- if (descOffsets.size() != 2)
- return rewriter.notifyMatchFailure(descOp,
- "offsets size is expected to be 2");
xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
if (!sgMap)
@@ -1085,16 +1082,26 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
descOp, "the tensor descriptor lacks sg_map attribute");
SmallVector<size_t> newRetIndices;
+ SmallVector<Value> newYieldValues;
+ SmallVector<Type> newYieldTypes;
+
+ for (auto arg : descOp->getOperands()) {
+ newYieldValues.push_back(arg);
+ newYieldTypes.push_back(arg.getType());
+ }
rewriter.setInsertionPoint(subgroupOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, subgroupOp, /* new yieled values = */ descOp.getSource(),
- /* new yielded types = */ descOp.getSourceType(), newRetIndices);
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+ /* new yielded types = */ newYieldTypes, newRetIndices);
+ SmallVector<Value> newDescOperands;
+ for (auto i : newRetIndices) {
+ newDescOperands.push_back(newWarpOp.getResult(i));
+ }
rewriter.setInsertionPointAfter(newWarpOp);
auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
- newWarpOp.getLoc(), descOp.getType(),
- dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
- descOffsets);
+ newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+ descOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(distributedVal, newDescOp);
@@ -1119,7 +1126,7 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
xegpu::SGMapAttr sgMapB =
mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
xegpu::SGMapAttr sgMapResult =
- mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_out"));
+ mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_c"));
if (!sgMapA || !sgMapB || !sgMapResult)
return rewriter.notifyMatchFailure(
dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
@@ -1177,6 +1184,12 @@ struct XeGPUSubgroupDistributePass final
};
} // namespace
+void xegpu::populateXeGPUSubgroupDistributePatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd,
+ SubgroupOpDpas>(patterns.getContext());
+}
+
void XeGPUSubgroupDistributePass::runOnOperation() {
auto &analyis = getAnalysis<RunSGMapPropagation>();
// Print the analysis result and exit. (for testing purposes)
@@ -1192,14 +1205,18 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
signalPassFailure();
/// Move all operations inside a GPU functions inside
/// gpu.warp_execute_on_lane0
- {
- RewritePatternSet patterns(&getContext());
- patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
- /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
- /// region.
- GreedyRewriteConfig config;
- config.cseConstants = false;
- config.fold = false;
- (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
- }
+
+ RewritePatternSet patterns(&getContext());
+ patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+  /// We want to prevent ops from being hoisted out of the gpu.warp_execute_on_lane0
+ /// region.
+ GreedyRewriteConfig config;
+ config.cseConstants = false;
+ config.fold = false;
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+
+ /// Finally, do the SIMD to SIMT distribution.
+ patterns.clear();
+ xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
}
>From 3abe7cb1655d3519f54dfde94015cd7f9a40c9be Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 20:25:27 +0000
Subject: [PATCH 12/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 20 +++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 925ba88a7a1db..0a7edb441f981 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -311,6 +311,11 @@ class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
void setToExitState(SGMapLattice *lattice) override {
(void)lattice->meet(SGMap());
}
+
+ LogicalResult initialize(Operation *top) override {
+ llvm::errs() << "SGMapPropagation::initialize\n";
+ return success();
+ }
};
} // namespace
@@ -581,8 +586,8 @@ class RunSGMapPropagation {
public:
RunSGMapPropagation(Operation *op) : target(op) {
SymbolTableCollection symbolTable;
- solver.load<DeadCodeAnalysis>();
- solver.load<SparseConstantPropagation>();
+ // solver.load<DeadCodeAnalysis>();
+ // solver.load<SparseConstantPropagation>();
solver.load<SGMapPropagation>(symbolTable);
(void)solver.initializeAndRun(op);
}
@@ -679,6 +684,7 @@ void attachLayoutAttributeToUsers(Value v, Attribute layout) {
static LogicalResult
attachLayoutAttributes(Operation *top,
llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
+ llvm::errs() << "op name : " << top->getName() << "\n";
/// Helper to convert SGMap to xegpu::SGMapAttr.
auto getSGMapForResult = [&](Value r) -> Attribute {
auto layout = getPropagatedLayout(r);
@@ -694,6 +700,16 @@ attachLayoutAttributes(Operation *top,
};
/// Attach the layout attributes to the results of the operations.
auto walkResult = top->walk([&](Operation *op) {
+ /// For function ops, propagate the argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (auto arg : func.getArguments()) {
+ auto sgMapAttr = getSGMapForResult(arg);
+ if (sgMapAttr) {
+ attachLayoutAttributeToUsers(arg, sgMapAttr);
+ }
+ }
+ return WalkResult::advance();
+ }
/// If no results, move on.
if (op->getNumResults() == 0)
return WalkResult::advance();
>From 596c953468e4c4c91f59975563594f52640df070 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 21 Mar 2025 21:27:56 +0000
Subject: [PATCH 13/53] update doc
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 40 ++++++++++--------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 41 ++++++++++---------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 28 ++++++-------
3 files changed, 59 insertions(+), 50 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7adb9df3c6b25..8eb1b99c9d2c3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -158,30 +158,33 @@ def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level cod
def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">; // simt level code
-def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumerate of scope",
+def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
[XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
def XeGPU_ScopeAttr
- : EnumAttr<XeGPU_Dialect,XeGPU_ScopeEnums, "Stage"> {
- let summary = [{Describe the stage of lowering progress}];
+ : EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
+  let summary = [{Describes the programming scope of the IR}];
let assemblyFormat = "``$value";
}
def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let summary = [{
- Describes the mapping between work item (WI) and the 2D tensor specified by the tensor descriptor.
+ Describes the data distribution to subgroups and work-items for a tensor
+ specified by the tensor descriptor.
}];
let description = [{
- XeGPU operations leverages LayoutAttr to distribute data across work-item. It is specified in tensor_descs
- upon the tensor description creation. LayoutAttr contains the following parameters.
-
- * scope: specifies the scope of current code. It can be either wg (workgroup), sg (subgroup) or wi (workitem).
- it is hard required for subgroup, but optional for workgroup and wi. By default, if a LayoutAttr
- contains sg_layout and sg_data, it will be treated as workgroup code; and if it only contains
- wi_layout and wi_data, it will be considered as workitem level.
+    XeGPU operations leverage LayoutAttr to distribute data across subgroups and workitems.
+    It is specified on tensor_descs when the tensor descriptor is created. LayoutAttr contains
+    the following parameters.
+
+ * scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
+ or wi (workitem). It is mandatory for subgroup-level programming and optional for workgroup
+ and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
+ it will be treated as workgroup level. Similarly, if it only includes wi_layout and wi_data,
+ it will be considered as workitem level.
* sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
* sg_data: [optional] specifies the data size accessed per subgroup.
    * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
@@ -189,16 +192,19 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
* wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
* wi_data: [required] specifies the data size accessed per work-item for a single distribution.
- `wi_data[0] * wi_data[1]` can be greater than 1, meaning that each work item operates on multiple elements,
- which is eventually lowered to "SIMT-flavor" vector, like SPIR-V vector or llvm vector, or packed to a storage data type.
- The multiple elements indicated by `wi_data` can only be from one dimension and must be contiguous in the memory along either dimension.
+ `wi_data[0] * wi_data[1]` can be greater than 1, indicating that each work item operates on multiple
+ elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
+ an LLVM vector, or packed into a storage data type. The multiple elements specified by wi_data must
+ come from a single dimension and be contiguous in memory along either dimension.
E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the subgroup has 16 work items in wi_layout=[1, 16], each accessing 1 element as specified by wi_data=[1, 1].
+ In this example, the subgroup consists of 16 work items arranged as wi_layout=[1, 16], with
+ each work item accessing a single element as defined by wi_data=[1, 1].
E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the layout representing a workgroup work distribution. A workgroup has 8 subgroups organized as 2x4 layout.
- and each subgroup accesses a 16x16 block per instruction, which is further disbributed to 16 work items as described above.
+ In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+ arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
+ distributed to 16 work items as described above.
}];
let parameters = (ins
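To make the second example in the description above concrete, the numbers work out as follows; this only restates the text, and the tensor_desc spelling matches the XeGPUTypes.td examples touched later in this PR.

```
// Workgroup-level layout attached to a 32x64 tensor descriptor:
!xegpu.tensor_desc<32x64xf32,
  #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>

// sg_layout = [2, 4]   -> 2 * 4 = 8 subgroups per workgroup
// sg_data   = [16, 16] -> each subgroup owns a 16x16 tile; 2*16 x 4*16 = 32x64
//                         covers the whole tensor exactly
// wi_layout = [1, 16], wi_data = [1, 1]
//                      -> within each subgroup, the 16x16 tile is further split
//                         across 16 work-items, one element at a time
```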
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 5e21bb805a6a5..9557a06e8e2a4 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -76,35 +76,38 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
- if (scope && scope.getValue() != Scope::WG &&
- (sg_layout || sg_data || order)) {
- return emitError() << "expected sg_layout, sg_data, and order being only "
- "used at workgroup level.";
- }
-
- if ((sg_layout != nullptr) ^ (sg_data != nullptr)) {
- return emitError() << "expected sg_layout and sg_data being both present "
- "or both absent";
+ if (sg_data) {
+ if (!sg_layout)
+ return emitError() << "expected sg_layout being used with sg_data.";
+ if (sg_data.size() != sg_layout.size())
+ return emitError() << "expected sg_data having the same rank as sg_layout";
}
if (order) {
if (!sg_layout)
- return emitError()
- << "expected order being used with sg_layout and sg_data.";
+ return emitError() << "expected order being used with sg_layout.";
if (order.size() != sg_layout.size())
- return emitError()
- << "expected order having the same rank as sg_layout and sg_data";
+ return emitError() << "expected order having the same rank as sg_layout";
+ }
+
+ if (sg_layout && sg_layout.size() > 2) {
+ return emitError() << "expected the rank of the layout to be at most 2";
}
- if (sg_layout &&
- (sg_layout.size() != sg_data.size() || sg_layout.size() > 2)) {
- return emitError() << "expected sg_layout and sg_data having the same "
- "rank, which is not larger than 2";
+ if (scope && scope.getValue() != Scope::WG &&
+ (sg_layout || sg_data || order)) {
+ return emitError() << "expected sg_layout, sg_data, or order being only "
+ "used at workgroup level.";
}
- if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2)
+  if (scope && scope.getValue() == Scope::WG && !sg_layout) {
+ return emitError() << "expected sg_layout for workgroup level layout";
+ }
+
+ if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2) {
return emitError() << "expected wi_layout and wi_data having the same "
- "rank, which is not larger than 2";
+ "rank, with a maximum rank of 2";
+ }
return success();
}
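A few attribute spellings showing what the reordered checks accept and reject; the diagnostics are quoted from the code above and the concrete numbers are illustrative.

```
// Accepted: sg_data and order both come with an sg_layout of matching rank.
#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [1, 0],
              wi_layout = [1, 16], wi_data = [1, 1]>

// Rejected with "expected sg_layout being used with sg_data.":
#xegpu.layout<sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>

// Rejected with "expected order having the same rank as sg_layout":
#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], order = [0],
              wi_layout = [1, 16], wi_data = [1, 1]>
```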
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 66b5054278c8c..59faa1d31454d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -593,25 +593,25 @@ LogicalResult DpasOp::verify() {
auto rhsShape = getRhsType().getShape();
auto resShape = getResultType().getShape();
- auto layoutA = getALayoutAttr();
- auto layoutB = getBLayoutAttr();
- auto layoutC = getCLayoutAttr();
+ auto aLayout = getALayoutAttr();
+ auto bLayout = getBLayoutAttr();
+ auto cLayout = getCLayoutAttr();
// make sure the layout attribute is either set for every available
// operand or simply not set at all. C is special, since ACC is optional.
// If they are all set, they also should be in the same scope.
auto isValidSet = [&]() {
- bool result = (layoutA != nullptr) ^ (layoutB != nullptr);
+ bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
if (hasAcc()) {
- result |= (layoutA != nullptr) ^ (layoutC != nullptr);
+ result |= (aLayout != nullptr) ^ (cLayout != nullptr);
}
result = !result;
- if (layoutA) {
- auto scope = layoutA.getScope();
- result &= layoutB ? scope == layoutB.getScope() : false;
+ if (aLayout) {
+ auto scope = aLayout.getScope();
+ result &= bLayout ? scope == bLayout.getScope() : false;
if (hasAcc())
- result &= layoutC ? scope == layoutC.getScope() : false;
+ result &= cLayout ? scope == cLayout.getScope() : false;
}
return result;
};
@@ -621,15 +621,15 @@ LogicalResult DpasOp::verify() {
"layout attributes should be either set for all operands (for SIMT "
"code) or not set at all (for SIMD code).");
- // query the scope from layoutA (a valid setting).
- if (layoutA && layoutA.isForWorkItemLevel()) {
+ // query the scope from aLayout (a valid setting).
+ if (aLayout && aLayout.isForWorkItemLevel()) {
// In SIMT mode, All data fragments must be 2D
if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
- auto wiLayoutA = layoutA.getWiLayout();
- auto wiLayoutB = layoutB.getWiLayout();
- auto wiLayoutC = layoutC.getWiLayout();
+ auto wiLayoutA = aLayout.getWiLayout();
+ auto wiLayoutB = bLayout.getWiLayout();
+ auto wiLayoutC = cLayout.getWiLayout();
// Obtain the expanded shapes of the operands and result using wi_layout.
// NOTE: For B, get rid of the packed dimension for the expanded shape.
SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
>From 2065764a5b76f76ad543a4c92a4d0112e38691a4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 21 Mar 2025 21:42:11 +0000
Subject: [PATCH 14/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 51 ++++++++-----------
1 file changed, 22 insertions(+), 29 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 0a7edb441f981..3ed3f462aa530 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -33,6 +33,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
@@ -311,11 +312,6 @@ class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
void setToExitState(SGMapLattice *lattice) override {
(void)lattice->meet(SGMap());
}
-
- LogicalResult initialize(Operation *top) override {
- llvm::errs() << "SGMapPropagation::initialize\n";
- return success();
- }
};
} // namespace
@@ -586,8 +582,8 @@ class RunSGMapPropagation {
public:
RunSGMapPropagation(Operation *op) : target(op) {
SymbolTableCollection symbolTable;
- // solver.load<DeadCodeAnalysis>();
- // solver.load<SparseConstantPropagation>();
+ solver.load<DeadCodeAnalysis>();
+ solver.load<SparseConstantPropagation>();
solver.load<SGMapPropagation>(symbolTable);
(void)solver.initializeAndRun(op);
}
@@ -660,7 +656,7 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
-void attachLayoutAttributeToUsers(Value v, Attribute layout) {
+void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
@@ -668,11 +664,11 @@ void attachLayoutAttributeToUsers(Value v, Attribute layout) {
/// attribute.
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
if (operandNumber == 0)
- dpasOp->setAttr("sg_map_a", layout);
+ dpasOp.setSgMapAAttr(layout);
else if (operandNumber == 1)
- dpasOp->setAttr("sg_map_b", layout);
+ dpasOp.setSgMapBAttr(layout);
else if (operandNumber == 2)
- dpasOp->setAttr("sg_map_c", layout);
+ dpasOp.setSgMapCAttr(layout);
continue;
}
/// For every other user, use a generic attribute name.
@@ -686,7 +682,7 @@ attachLayoutAttributes(Operation *top,
llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
llvm::errs() << "op name : " << top->getName() << "\n";
/// Helper to convert SGMap to xegpu::SGMapAttr.
- auto getSGMapForResult = [&](Value r) -> Attribute {
+ auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
auto layout = getPropagatedLayout(r);
if (!layout.isAssigned())
return {};
@@ -1137,28 +1133,25 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- xegpu::SGMapAttr sgMapA =
- mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_a"));
- xegpu::SGMapAttr sgMapB =
- mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_b"));
- xegpu::SGMapAttr sgMapResult =
- mlir::dyn_cast_or_null<xegpu::SGMapAttr>(dpasOp->getAttr("sg_map_c"));
- if (!sgMapA || !sgMapB || !sgMapResult)
+ xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
+ xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
+ xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
+ if (!sgMapA || !sgMapB || !sgMapOut)
return rewriter.notifyMatchFailure(
- dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or result");
+ dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
auto distributedLhsTypeOrFailure =
getDistributedVectorType(sgMapA, dpasOp.getLhsType());
auto distributedRhsTypeOrFailure =
getDistributedVectorType(sgMapB, dpasOp.getRhsType());
auto distributedResultTypeOrFailure =
- getDistributedVectorType(sgMapResult, dpasOp.getResultType());
+ getDistributedVectorType(sgMapOut, dpasOp.getResultType());
if (failed(distributedLhsTypeOrFailure) ||
failed(distributedRhsTypeOrFailure) ||
failed(distributedResultTypeOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
- "Failed to distribute the A, B or result types in xegpu::Dpas op");
+ "Failed to distribute the A, B or output types in xegpu::Dpas op");
llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
@@ -1175,15 +1168,15 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
// Create a new dpas op outside the warp op.
rewriter.setInsertionPointAfter(newWarpOp);
- auto newDpasOp = cast<xegpu::DpasOp>(*dpasOp.clone());
- newDpasOp.getLhsMutable().assign(newWarpOp.getResult(newRetIndices[0]));
- newDpasOp.getRhsMutable().assign(newWarpOp.getResult(newRetIndices[1]));
- if (dpasOp.getAcc())
- newDpasOp.getAccMutable().assign(newWarpOp.getResult(newRetIndices[2]));
- newDpasOp->getOpResult(0).setType(distributedResultTypeOrFailure.value());
+ SmallVector<Value> newDpasOperands;
+ for (auto i : newRetIndices) {
+ newDpasOperands.push_back(newWarpOp.getResult(i));
+ }
+ auto newDpasOp = rewriter.create<xegpu::DpasOp>(
+ newWarpOp->getLoc(), distributedResultTypeOrFailure.value(),
+ newDpasOperands, dpasOp->getAttrs());
Value disributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
-
return success();
}
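At the type level, the rebuilt dpas created after the warp op consumes the per-lane fragments yielded through newRetIndices and produces the distributed result type. A sketch of the shapes involved for a 16-lane subgroup, assuming the usual A/C mapping wi_layout = [1, 16], wi_data = [1, 1] and the packed B mapping wi_data = [2, 1]; the value names are illustrative.

```
// Inside the warp region (subgroup-level):
//   xegpu.dpas : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
//
// Outside the warp region, per lane, after distribution:
%c = xegpu.dpas %a_frag, %b_frag : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
```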
>From 899439bdb4a827b9248c9e163fadf1df312d28b5 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 15:49:43 +0000
Subject: [PATCH 15/53] refine docs
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 34 +--
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 16 +-
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 8 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 72 +++---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 22 +-
mlir/test/Dialect/XeGPU/invalid.mlir | 74 +++---
mlir/test/Dialect/XeGPU/ops.mlir | 222 +++++++++---------
7 files changed, 224 insertions(+), 224 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 8eb1b99c9d2c3..2f9aa0106b1bc 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -154,12 +154,12 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
-def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
-def XeGPU_ScopeWI: I32EnumAttrCase<"WI", 2, "wi">; // simt level code
+def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
+def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
+def XeGPU_ScopeLane: I32EnumAttrCase<"Lane", 2, "lane">; // simt level code
def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
- [XeGPU_ScopeWG,XeGPU_ScopeSG,XeGPU_ScopeWI]> {
+ [XeGPU_ScopeWG, XeGPU_ScopeSG, XeGPU_ScopeLane]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}
@@ -181,27 +181,27 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
the following parameters.
* scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
- or wi (workitem). It is mandatory for subgroup-level programming and optional for workgroup
+ or lane (workitem). It is mandatory for subgroup-level programming and optional for workgroup
and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
- it will be treated as workgroup level. Similarly, if it only includes wi_layout and wi_data,
+ it will be treated as workgroup level. Similarly, if it only includes lane_layout and lane_data,
it will be considered as workitem level.
* sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
* sg_data: [optional] specifies the data size accessed per subgroup.
    * order: [optional] specifies the dimension order used to linearize n-d subgroup ids to 1-d.
The first dimension in the order list is the fastest-changing dimension.
- * wi_layout: [required] specifies the total number of work-items and their layout in a subgroup
- * wi_data: [required] specifies the data size accessed per work-item for a single distribution.
+ * lane_layout: [required] specifies the total number of work-items and their layout in a subgroup
+ * lane_data: [required] specifies the data size accessed per work-item for a single distribution.
- `wi_data[0] * wi_data[1]` can be greater than 1, indicating that each work item operates on multiple
+ `lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
- an LLVM vector, or packed into a storage data type. The multiple elements specified by wi_data must
+ an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
come from a single dimension and be contiguous in memory along either dimension.
- E.g., #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
- In this example, the subgroup consists of 16 work items arranged as wi_layout=[1, 16], with
- each work item accessing a single element as defined by wi_data=[1, 1].
+ E.g., #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
+ each work item accessing a single element as defined by lane_data=[1, 1].
- E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+ E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
distributed to 16 work items as described above.
@@ -212,8 +212,8 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
OptionalParameter<"DenseI32ArrayAttr">: $order,
- "DenseI32ArrayAttr": $wi_layout,
- "DenseI32ArrayAttr": $wi_data
+ "DenseI32ArrayAttr": $lane_layout,
+ "DenseI32ArrayAttr": $lane_data
);
let extraClassDeclaration = [{
@@ -230,7 +230,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
bool isForWorkItemLevel() {
if (!getScope())
return !getSgLayout() && !getSgData() && !getOrder();
- return getScope() == ScopeAttr::get(getContext(), Scope::WI);
+ return getScope() == ScopeAttr::get(getContext(), Scope::Lane);
}
}];
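With the rename, isForWorkItemLevel() treats both of the following as work-item (SIMT) level layouts: one with an explicit lane scope and one that carries nothing but the lane fields. The spellings match the tests updated further down in this patch.

```
// Explicit lane scope:
#xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>

// Implicit: no scope and no sg_layout/sg_data/order, only the lane fields.
#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
```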
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index a3ee6e901a775..7188f74815943 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -113,7 +113,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
%c0 = arith.constant 0 : index
%c1 = arith.constant 8 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+ -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
@@ -323,7 +323,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32,
- #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
```
@@ -381,7 +381,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
- #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
@@ -422,7 +422,7 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
Example 2 (SIMT mode):
```
%2 = xegpu.update_nd_offset %1, [0, 16]:
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
@@ -517,7 +517,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
- #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>
+ #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
```
}];
@@ -653,7 +653,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
- !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>>
+ !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
vector<16xi1> -> vector<8x1xf32>
```
@@ -732,7 +732,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
: vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
- !xegpu.layout<wi_layout = [16, 1], wi_data = [1, 1]>> vector<16xi1>
+ !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> vector<16xi1>
```
}];
@@ -790,7 +790,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
%2 = xegpu.update_offset %1, %off :
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
- #xegpu.layout<wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
```
}];
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 82d6a4ec39e6b..8559f4beb2c03 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -62,7 +62,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
static-dim-list ::= decimal-literal `x` decimal-literal
attr-list = (, encoding-attr)? (, layout-attr)?
    encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
- layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? wi_layout = value, wi_data = value `>`)?
+ layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? lane_layout = value, lane_data = value `>`)?
```
Examples:
@@ -78,13 +78,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
// A TensorDesc with a layout for workgroup level programming
- xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
// A TensorDesc with a layout for subgroup level programming
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>>
// A TensorDesc with a layout for workitem level programming
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 9557a06e8e2a4..0da86f1af33e4 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -74,7 +74,7 @@ LogicalResult
LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
ScopeAttr scope, DenseI32ArrayAttr sg_layout,
DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
- DenseI32ArrayAttr wi_layout, DenseI32ArrayAttr wi_data) {
+ DenseI32ArrayAttr lane_layout, DenseI32ArrayAttr lane_data) {
if (sg_data) {
if (!sg_layout)
@@ -104,8 +104,8 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return emitError() << "expected sg_layout for workgroup level layout";
}
- if (wi_layout.size() != wi_data.size() || wi_layout.size() > 2) {
- return emitError() << "expected wi_layout and wi_data having the same "
+ if (lane_layout.size() != lane_data.size() || lane_layout.size() > 2) {
+ return emitError() << "expected lane_layout and lane_data having the same "
"rank, with a maximum rank of 2";
}
@@ -249,11 +249,11 @@ LogicalResult TensorDescType::verify(
}
if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
- ArrayRef<int32_t> wiLayout = layoutAttr.getWiLayout().asArrayRef();
- ArrayRef<int32_t> wiData = layoutAttr.getWiData().asArrayRef();
+ ArrayRef<int32_t> laneLayout = layoutAttr.getLaneLayout().asArrayRef();
+ ArrayRef<int32_t> laneData = layoutAttr.getLaneData().asArrayRef();
if (rank == 1) {
- if (wiLayout[0] != 1 || wiData[0] != 1)
+ if (laneLayout[0] != 1 || laneData[0] != 1)
return emitError()
<< "outer layout distribution and data mapping must be 1 "
"for 1D tensor";
@@ -265,10 +265,10 @@ LogicalResult TensorDescType::verify(
// [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
// respectively, the mapping should reflect that. This is because each
  // work item accesses data at 32-bit granularity.
- if (wiData[0] != 1)
+ if (laneData[0] != 1)
return emitError()
<< "cannot map over non-contiguous scattered row elements";
- if (wiData[1] != packingFactor)
+ if (laneData[1] != packingFactor)
return emitError() << "work item data mapping must match the number of "
"contiguous elements";
}
@@ -281,10 +281,10 @@ LogicalResult TensorDescType::verify(
size_t dims = tensorShape.size();
for (size_t i = 0; i < dims; ++i) {
- uint32_t numElemPerWi = wiLayout[i] * wiData[i];
+ uint32_t numElemPerWi = laneLayout[i] * laneData[i];
if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
return emitError() << "cannot distribute " << tensorShape[i] << " over "
- << wiLayout[i] << " work items with " << wiData[i]
+ << laneLayout[i] << " work items with " << laneData[i]
<< " elements each";
}
}
@@ -295,16 +295,16 @@ LogicalResult TensorDescType::verify(
// If tensor descriptor has a layout attribute it is used in SIMT mode.
// In this mode, the distributed vector shape is determined as follows:
// Definitions:
-// wi_data_size = wi_data[0] × wi_data[1]
-// subgroup_size = wi_layout[0] × wi_layout[1]
-// distribution_unit_size = subgroup_size × wi_data_size
+// lane_data_size = lane_data[0] × lane_data[1]
+// subgroup_size = lane_layout[0] × lane_layout[1]
+// distribution_unit_size = subgroup_size × lane_data_size
// ---------------------------------------------------------------------
// Case 1: Regular loads/stores.
// ---------------------------------------------------------------------
// Distributed vector shape must be:
-// [chunk_size / wi_data_size, wi_data_size]
+// [chunk_size / lane_data_size, lane_data_size]
// If the tensor descriptor shape is 1D, first dimension is ignored (set to 1).
-// [wi_data_size]
+// [lane_data_size]
// ---------------------------------------------------------------------
// Case 2: Block loads/stores
// ---------------------------------------------------------------------
@@ -312,23 +312,23 @@ LogicalResult TensorDescType::verify(
// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
// n_distribution_units = tensor_size / distribution_unit_size
// Given above definitions, the following conditions must be met:
-// * tensor_desc[0] % (wi_layout[0] × wi_data[0]) == 0
-// * tensor_desc[1] % (wi_layout[1] × wi_data[1]) == 0
+// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
+// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
// Distributed vector shape must be:
-// [n_distribution_units, wi_data_size]
+// [n_distribution_units, lane_data_size]
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
// If no layout is provided, tensor desc is not used in SIMT mode.
if (!layout || !layout.isForWorkItemLevel())
return failure();
- SmallVector<int64_t> wiData(layout.getWiData().asArrayRef());
- SmallVector<int64_t> wiLayout(layout.getWiLayout().asArrayRef());
+ SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
+ SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
auto tdescShape = getShape();
- auto wiDataSize = 1, sgSize = 1;
- for (auto [wiDim, wiDataDim] : llvm::zip_equal(wiLayout, wiData)) {
- wiDataSize *= wiDataDim;
+ auto laneDataSize = 1, sgSize = 1;
+ for (auto [wiDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
+ laneDataSize *= laneDataDim;
sgSize *= wiDim;
}
@@ -338,35 +338,35 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto chunkSize = scatterAttr.getChunkSize().getInt();
// Verify if the first dimension of the tensor descriptor shape is
// distributable.
- assert(tdescShape[0] % (wiLayout[0]) == 0 &&
+ assert(tdescShape[0] % (laneLayout[0]) == 0 &&
"tensor descriptor shape is not distributable");
if (chunkSize > 1)
- return VectorType::get({chunkSize / wiDataSize, wiDataSize},
+ return VectorType::get({chunkSize / laneDataSize, laneDataSize},
getElementType());
- return VectorType::get({wiDataSize}, getElementType());
+ return VectorType::get({laneDataSize}, getElementType());
}
// Case 2: block loads/stores
- // Tensor descriptor shape can be 1D. For the 1D case, outer dims of wiData
- // and wiLayout must be 1.
+ // Tensor descriptor shape can be 1D. For the 1D case, outer dims of laneData
+ // and laneLayout must be 1.
if (tdescShape.size() == 1) {
- assert((wiData[0] == 1 && wiLayout[0] == 1) &&
- "wi_data[0] and wi_layout[0] must be 1 for 1D tensor descriptor");
- wiData = {wiData[1]};
- wiLayout = {wiLayout[1]};
+ assert((laneData[0] == 1 && laneLayout[0] == 1) &&
+ "lane_data[0] and lane_layout[0] must be 1 for 1D tensor descriptor");
+ laneData = {laneData[1]};
+ laneLayout = {laneLayout[1]};
}
// Check if the tensor descriptor shape is distributable.
int64_t tensorSize = 1;
- for (auto [tdescDim, wiDim, wiDataDim] :
- llvm::zip_equal(tdescShape, wiLayout, wiData)) {
- assert((tdescDim % (wiDim * wiDataDim) == 0) &&
+ for (auto [tdescDim, wiDim, laneDataDim] :
+ llvm::zip_equal(tdescShape, laneLayout, laneData)) {
+ assert((tdescDim % (wiDim * laneDataDim) == 0) &&
"tensor descriptor shape is not distributable");
tensorSize *= tdescDim;
}
// tensorSize must be adjusted for array_length.
tensorSize *= getArrayLength();
- return VectorType::get({tensorSize / (sgSize * wiDataSize), wiDataSize},
+ return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
getElementType());
}
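A worked instance of the Case 2 (block load/store) rules in the comment above, using the shapes from the load_nd documentation; the numbers only evaluate the stated formulas.

```
// !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
//   lane_data_size       = 1 * 1                     = 1
//   subgroup_size        = 1 * 16                    = 16
//   tensor_size          = 8 * 16 * 1 (array_length) = 128
//   n_distribution_units = 128 / (16 * 1)            = 8
//   => distributed vector type is vector<8x1xf32>
%v = xegpu.load_nd %td
    : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    -> vector<8x1xf32>

// 1D descriptor: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
//   the outer lane dims must be 1; tensor_size = 16, and 16 / (16 * 1) = 1
//   => distributed vector type is vector<1x1xf32> (matching the invalid.mlir diagnostics)
```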
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 59faa1d31454d..e2ccc59d39371 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -113,8 +113,8 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
data = attr.getSgData().asArrayRef();
layout = attr.getSgLayout().asArrayRef();
} else {
- data = attr.getWiData().asArrayRef();
- layout = attr.getWiLayout().asArrayRef();
+ data = attr.getLaneData().asArrayRef();
+ layout = attr.getLaneLayout().asArrayRef();
}
for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
// check s % (d * l) != 0
@@ -627,17 +627,17 @@ LogicalResult DpasOp::verify() {
if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
- auto wiLayoutA = aLayout.getWiLayout();
- auto wiLayoutB = bLayout.getWiLayout();
- auto wiLayoutC = cLayout.getWiLayout();
- // Obtain the expanded shapes of the operands and result using wi_layout.
+ auto laneLayoutA = aLayout.getLaneLayout();
+ auto laneLayoutB = bLayout.getLaneLayout();
+ auto laneLayoutC = cLayout.getLaneLayout();
+ // Obtain the expanded shapes of the operands and result using lane_layout.
// NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
- lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
+ lhsShape[1] * laneLayoutA[1]};
SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resShape[0] * wiLayoutC[0],
- resShape[1] * wiLayoutC[1]};
+ rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
+ resShape[1] * laneLayoutC[1]};
auto bK = expandedShapeB[0];
if (bK != expandedShapeA[1])
return emitOpError("K-dimension mismatch.");
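For the SIMT branch with the renamed accessors, a worked example of the shape expansion, mirroring the dpas layouts used in the tests below; the 8x1/8x2 fragments assume a 16-lane subgroup.

```
// lane_layout = [1, 16] for A, B and C:
//   A: vector<8x1xf16>, lane_data = [1, 1] -> expandedShapeA = {8*1, 1*16}   = {8, 16}
//   B: vector<8x2xf16>, lane_data = [2, 1] -> expandedShapeB = {8*2*1, 1*16} = {16, 16}
//   C: vector<8x1xf32>, lane_data = [1, 1] -> expandedShapeC = {8*1, 1*16}   = {8, 16}
// bK = expandedShapeB[0] = 16 equals expandedShapeA[1] = 16, so the K check passes.
%r = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
                        b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
                        c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
    : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
```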
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index c4958d920a89f..17e4f60638905 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,11 +80,11 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<8x2xf32>
return
}
@@ -92,11 +92,11 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<8xf32>
return
}
@@ -136,20 +136,20 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
// -----
func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
return
}
// -----
func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
return
}
@@ -248,7 +248,7 @@ func.func @test_prefetch_vc_2(%src: ui64) {
func.func @test_create_tdesc_layout_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
return
}
@@ -256,7 +256,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
func.func @test_create_tdesc_layout_2(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [2, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [2, 1]>>
return
}
@@ -264,7 +264,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
func.func @test_create_tdesc_layout_3(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
return
}
@@ -272,9 +272,9 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
func.func @test_load_gather_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
@@ -282,9 +282,9 @@ func.func @test_load_gather_layout_1(%src: ui64) {
func.func @test_load_gather_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
@@ -294,9 +294,9 @@ func.func @test_store_scatter_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
return
}
@@ -305,9 +305,9 @@ func.func @test_store_scatter_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
return
}
@@ -396,16 +396,16 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
// -----
func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
// expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
return
}
// -----
func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
// expected-error at +1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
- b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
- c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
return
}
@@ -439,7 +439,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [2, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [2, 16], lane_data = [1, 1]>>
return
}
@@ -447,7 +447,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
return
}
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 8 over 16 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
return
}
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 1]>>
return
}
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 2 work items with 4 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [2, 8], wi_data = [4, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [2, 8], lane_data = [4, 1]>>
return
}
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = wi, wi_layout = [8, 2], wi_data = [1, 2]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 2]>>
return
}
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
!xegpu.tensor_desc<4x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<scope = wi, wi_layout = [1, 1], wi_data = [2, 1]>>
+ #xegpu.layout<scope = lane, lane_layout = [1, 1], lane_data = [2, 1]>>
return
}
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
return
}
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<scope = wi, wi_layout = [1, 8], wi_data = [1, 2]>>
+ #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
return
}
@@ -520,22 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected chunk blocks for 2D tensor}}
!xegpu.tensor_desc<16x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.layout<scope = wi, wi_layout = [8, 1], wi_data = [1, 2]>>
+ #xegpu.layout<scope = lane, lane_layout = [8, 1], lane_data = [1, 2]>>
return
}
// -----
func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
// expected-error at +1 {{expected different srcMap and resMap}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>,
- resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
// -----
func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
// expected-error at +1 {{expected srcMap and resMap be in the same scope}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
- resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
\ No newline at end of file
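For contrast with the two negative convert_layout cases above, a sketch of a form the quoted checks would accept: srcMap and resMap differ but are both workgroup-level. The resMap tiling here is illustrative, and verifier rules not shown in this hunk may still apply.

```
func.func @test_convert_layout_valid(%a: vector<32x64xf16>) {
  %0 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
  return
}
```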
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 6a29a73a20612..e52562a2f453d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
gpu.return
}
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<8x1xf32>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<8x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
gpu.return
}
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<2x1xf32>
+ !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
gpu.return
}
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
gpu.return
}
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
gpu.return
}
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = wi, wi_layout = [16, 1], wi_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
%1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
%1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
gpu.func @test_create_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
gpu.return
}
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
gpu.func @test_create_tdesc_simt_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
gpu.return
}
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
gpu.return
}
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
gpu.return
}
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
%2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
%2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
gpu.return
}
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = wi, wi_layout = [1, 4], wi_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.func @test_prefetch_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
%s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = wi, wi_layout = [4, 1], wi_data = [1, 1]>>, vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
gpu.return
}
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
- // CHECK: b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
- // CHECK: c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>,
- b_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [2, 1]>,
- c_layout = #xegpu.layout<scope = wi, wi_layout = [1, 16], wi_data = [1, 1]>}
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+ // CHECK: b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+ // CHECK: c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
+ b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
+ c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
gpu.return
}
@@ -706,20 +706,20 @@ gpu.func @fence() {
// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], wi_layout = [1, 16], wi_data = [8, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
gpu.return
}
gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [2, 1]>,
- resMap = #xegpu.layout<scope = sg, wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [2, 1]>,
+ resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>,
- resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], wi_layout = [1, 16], wi_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+ resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
>From 8636d1562fc1ffc7a7d4365847a3a3dfa18782aa Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 16:21:26 +0000
Subject: [PATCH 16/53] refine docs
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 70 ++++++++++++-------
1 file changed, 45 insertions(+), 25 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 2f9aa0106b1bc..7bb59796af36e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -166,7 +166,11 @@ def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
def XeGPU_ScopeAttr
: EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
- let summary = [{Describe the programming scope of the IR}];
+ let summary = [{Defines the programming scope of the IR,
+ where WG represents the workgroup level,
+ SG represents the subgroup level, and
+ Lane represents the work-item level}];
+
let assemblyFormat = "``$value";
}
@@ -176,37 +180,53 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
specified by the tensor descriptor.
}];
let description = [{
- XeGPU operations leverages LayoutAttr to distribute data across subgroups and workitems.
- It is specified in tensor_descs upon the tensor description creation. LayoutAttr contains
- the following parameters.
-
- * scope: Specifies the scope of the current code, which can be either wg (workgroup), sg (subgroup),
- or lane (workitem). It is mandatory for subgroup-level programming and optional for workgroup
- and workitem-level programming. By default, if a LayoutAttr includes sg_layout and sg_data,
- it will be treated as workgroup level. Similarly, if it only includes lane_layout and lane_data,
- it will be considered as workitem level.
- * sg_layout: [optional] specifies the total number of subgroups and their layout in a workgroup.
- * sg_data: [optional] specifies the data size accessed per subgroup.
- * order: [optional] specifies the dimension order used to linearize n-d sbugroup ids to 1-d.
- The first dimension in the order list is the fastest-changing dimension.
- * lane_layout: [required] specifies the total number of work-items and their layout in a subgroup
- * lane_data: [required] specifies the data size accessed per work-item for a single distribution.
+ XeGPU operations use `LayoutAttr` to define how data is distributed across subgroups and work-items.
+ This attribute is specified in tensor descriptors during tensor description creation. `LayoutAttr`
+ includes the following parameters, categorized into three groups:
+
+ ### Group 1:
+ * scope: Defines the scope of the code, which can be `wg` (workgroup), `sg` (subgroup),
+ or `lane` (work-item). It is mandatory for subgroup-level programming but optional
+ for workgroup and work-item levels. By default:
+    - If `sg_layout` is included, the layout is treated as workgroup level.
+    - If only `lane_layout` and `lane_data` are included, it is considered work-item level.
+
+ ### Group 2:
+ * sg_layout (optional): Specifies the total number of subgroups and their layout within a workgroup.
+ It is mandatory for workgroup-level programming. Its presence implies workgroup-level code, and
+ the scope must be empty or set to `wg`.
+ * sg_data (optional): Defines the data size accessed per subgroup. It must be used with sg_layout or
+ left empty, in which case it can be derived from `lane_layout` and `lane_data` using the formula:
+ `sg_data[i] = lane_layout[i] * lane_data[i]`.
+  * order (optional): Specifies the dimension order used to linearize n-dimensional subgroup IDs to
+ 1-dimensional IDs. The first dimension in the order list is the fastest-changing dimension.
+
+ ### Group 3:
+  * lane_layout (required): Specifies the total number of work-items and their layout within a subgroup.
+  * lane_data (required): Specifies the data size accessed per work-item for a single distribution.
`lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
come from a single dimension and be contiguous in memory along either dimension.
- E.g., #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
- each work item accessing a single element as defined by lane_data=[1, 1].
-
- E.g., #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
- In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
- arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
- distributed to 16 work items as described above.
-
+ ### Examples:
+ 1. Work-item level layout:
+ ```mlir
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ ```
+ In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
+ each work item accessing a single element as defined by lane_data=[1, 1].
+
+ 2. Workgroup level layout:
+ ```mlir
+ #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+ ```
+ In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+ arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
+ distributed to 16 work items as described above.
}];
+
let parameters = (ins
OptionalParameter<"ScopeAttr">: $scope,
OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
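For the `sg_data` derivation above, a small worked example (illustrative only, not taken from the patch): with `lane_layout = [1, 16]` and `lane_data = [2, 1]`, an omitted `sg_data` resolves to `[1*2, 16*1] = [2, 16]`, i.e. each subgroup covers a 2x16 block per distribution unit:
```mlir
// sg_data is omitted; it is derived as sg_data[i] = lane_layout[i] * lane_data[i] = [2, 16].
#xegpu.layout<sg_layout = [4, 2], lane_layout = [1, 16], lane_data = [2, 1]>
```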
>From 0190418212529ec164a52f52582c1f77ecbd5c09 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 16:27:02 +0000
Subject: [PATCH 17/53] refine util
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 7bb59796af36e..4afeef1427e8b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -239,7 +239,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let extraClassDeclaration = [{
bool isForWorkgroupLevel() {
if (!getScope())
- return getSgLayout() && getSgData();
+ return getSgLayout() != nullptr;
return getScope() == ScopeAttr::get(getContext(), Scope::WG);
}
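With this refinement, `sg_data` is no longer required for a layout to be classified as workgroup level; the presence of `sg_layout` (or an explicit `wg` scope) is sufficient. A minimal illustrative attribute that is now accepted as workgroup level, assuming the syntax documented earlier in the series:
```mlir
// sg_layout is present and sg_data is omitted (derivable from
// lane_layout * lane_data), so isForWorkgroupLevel() now returns true.
#xegpu.layout<sg_layout = [2, 4], lane_layout = [1, 16], lane_data = [1, 1]>
```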
>From 32f9272752c48ded0fa51c362fe2ed138614937b Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 24 Mar 2025 17:06:03 +0000
Subject: [PATCH 18/53] refine convert_layout docs
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 7188f74815943..41911ee1aa323 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -984,9 +984,12 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
}
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
- let summary = "Convert the sg layout of the input operand";
+ let summary = "Convert the layout of the input operand";
let description = [{
- convert_layout remaps the distribution of data across workitems by updating the LayoutAttr.
+ `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
+ the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
+ as workgroup-level (wg) or subgroup-level (sg) code. This operation is not supported for
+ work-item-level code.
}];
let arguments = (ins XeGPU_Vector2DType: $source,
XeGPU_LayoutAttr: $srcMap,
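`convert_layout` changes only the layout metadata, not the value type, and both maps must belong to the same scope. The fragment below is a rough subgroup-level usage sketch; the assembly format is an assumption for illustration and may not match the op's exact printed form.
```mlir
// Illustrative only: re-map a 16x16 tile from one lane distribution to another.
// The vector type is unchanged; only srcMap/resMap differ, and both are
// subgroup-scope layouts.
%1 = xegpu.convert_layout %0 {
       srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
       resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>
     } : vector<16x16xf16>
```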
>From fe11c7987a8822afec39f905cf4421496fef7b55 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 24 Mar 2025 18:25:59 +0000
Subject: [PATCH 19/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 30 +++++++++++--------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 3ed3f462aa530..49e0935f88705 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -934,6 +934,16 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
return newVectorType;
}
+/// An operation can be sunk out of WarpExecuteOnLane0 if all ops in its
+/// use-def chain are already sunk.
+static bool canBeSinked(Operation *op) {
+ DenseSet<Operation *> visited;
+ visited.insert(op);
+ while (!visited.empty()) {
+ }
+ return true;
+}
+
LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
/// If the function already moved inside a warp_execute_on_lane0, skip.
@@ -1052,17 +1062,13 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
- /* new yielded types = */ TypeRange{tensorDescTy}, newRetIndices);
+ /* new yielded types = */ tensorDescTy, newRetIndices);
// Create a new load op outside the warp op with the distributed vector type.
rewriter.setInsertionPointAfter(newWarpOp);
auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
- loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
- loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
- loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
-
- newLoadOp.getTensorDescMutable().assign(
- newWarpOp.getResult(newRetIndices[0]));
+ newWarpOp.getLoc(), newVectorType, newWarpOp->getResults()[0],
+ loadOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
return success();
@@ -1219,13 +1225,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
/// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
/// region.
- GreedyRewriteConfig config;
- config.cseConstants = false;
- config.fold = false;
- (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+ // GreedyRewriteConfig config;
+ // config.cseConstants = false;
+ // config.fold = false;
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
/// Finally, do the SIMD to SIMT distribution.
patterns.clear();
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
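The change to `SubgroupOpLoadNd` above means the warp op now yields the (non-distributed) tensor descriptor and the load is recreated after the warp op with the distributed vector type. A schematic before/after in MLIR follows; the shapes and `sg_map` values are assumptions for illustration, not copied from the tests, and `%tdesc`/`%laneid` are assumed to be defined earlier (`create_nd_tdesc` has its own pattern).
```mlir
// Before (schematic): the load runs inside the warp region and the region
// yields the full subgroup-level vector, which the warp op distributes.
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
  %ld = xegpu.load_nd %tdesc
      : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
      -> vector<16x16xf16>
  gpu.yield %ld : vector<16x16xf16>
}

// After (schematic): the region yields the tensor descriptor unchanged and the
// load is materialized outside the warp op with the per-lane vector type.
%w = gpu.warp_execute_on_lane_0(%laneid)[16]
    -> (!xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>) {
  gpu.yield %tdesc : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
}
%ld = xegpu.load_nd %w
    : !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
    -> vector<16x1xf16>
```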
>From 6e1ef3ea8324bc07b31f148325426541031604d2 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 24 Mar 2025 21:44:48 +0000
Subject: [PATCH 20/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 52 ++++++++++++++-----
1 file changed, 40 insertions(+), 12 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 49e0935f88705..04ff165bb5313 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -13,9 +13,11 @@
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
@@ -1220,18 +1222,44 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
signalPassFailure();
/// Move all operations inside a GPU functions inside
/// gpu.warp_execute_on_lane0
-
- RewritePatternSet patterns(&getContext());
- patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-  /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
- /// region.
- // GreedyRewriteConfig config;
- // config.cseConstants = false;
- // config.fold = false;
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
-
+ {
+ RewritePatternSet patterns(&getContext());
+ patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+    /// We want to avoid ops being hoisted out of the gpu.warp_execute_on_lane0
+ /// region.
+ GreedyRewriteConfig config;
+ config.cseConstants = false;
+ config.fold = false;
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+ }
/// Finally, do the SIMD to SIMT distribution.
- patterns.clear();
+ RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ auto distributionFn = [](Value val) {
+ // Create an identity dim map of the same rank as the vector.
+ VectorType vecType = dyn_cast<VectorType>(val.getType());
+ int64_t vecRank = vecType ? vecType.getRank() : 0;
+ OpBuilder builder(val.getContext());
+ if (vecRank == 0)
+ return AffineMap::get(val.getContext());
+ return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+ };
+ auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+ int64_t warpSz) {
+ assert((val.getType().isF32() || val.getType().isInteger(32)) &&
+ "unsupported shuffle type");
+ Type i32Type = builder.getIntegerType(32);
+ Value srcIdxI32 = builder.create<arith::IndexCastOp>(loc, i32Type, srcIdx);
+ Value warpSzI32 = builder.create<arith::ConstantOp>(
+ loc, builder.getIntegerAttr(i32Type, warpSz));
+ Value result = builder
+ .create<gpu::ShuffleOp>(loc, val, srcIdxI32, warpSzI32,
+ gpu::ShuffleMode::IDX)
+ .getResult(0);
+ return result;
+ };
+ vector::populatePropagateWarpVectorDistributionPatterns(
+ patterns, distributionFn, shuffleFn);
+ llvm::errs() << AffineMap::getMultiDimIdentityMap(2, &getContext()) << "\n";
+ // (void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
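The `shuffleFn` callback registered above lowers a read of a value from another lane to `gpu.shuffle` in `idx` mode, after casting the source lane index to i32. A sketch of the IR such a callback emits; the value, lane, and width SSA names are illustrative.
```mlir
// Read %val (an f32) from lane %src of a 16-wide subgroup.
// gpu.shuffle also returns a validity flag alongside the shuffled value.
%src_i32 = arith.index_cast %src : index to i32
%width = arith.constant 16 : i32
%result, %valid = gpu.shuffle idx %val, %src_i32, %width : f32
```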
>From 55c272c367ad296631db90740ed736e1eb7ea1e4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 25 Mar 2025 02:16:03 +0000
Subject: [PATCH 21/53] save work
---
.../Vector/Transforms/VectorDistribution.h | 4 ++++
.../Vector/Transforms/VectorDistribute.cpp | 16 +++++++++++-----
.../lib/Dialect/Vector/TestVectorTransforms.cpp | 2 ++
3 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index dda45219b2acc..082d990cee8a4 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -98,6 +98,10 @@ void populatePropagateWarpVectorDistributionPatterns(
const WarpShuffleFromIdxFn &warpShuffleFromIdxFn,
PatternBenefit benefit = 1, PatternBenefit readBenefit = 0);
+/// Patterns for simplification of WarpExecuteOnLane0Op during distribution.
+void populateWarpSimplificationPatterns(RewritePatternSet &pattern,
+ PatternBenefit benefit = 1);
+
/// Lambda signature to compute a reduction of a distributed value for the given
/// reduction kind and size.
using DistributedReductionFn =
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index e214257de2cdf..f0d771142e307 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1761,17 +1761,23 @@ void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit,
PatternBenefit readBenefit) {
patterns.add<WarpOpTransferRead>(patterns.getContext(), readBenefit);
- patterns.add<WarpOpElementwise, WarpOpDeadResult, WarpOpBroadcast,
- WarpOpShapeCast, WarpOpExtract, WarpOpForwardOperand,
- WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
- WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
- patterns.getContext(), benefit);
+ patterns
+ .add<WarpOpElementwise, WarpOpBroadcast, WarpOpShapeCast, WarpOpExtract,
+ WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
+ WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
+ patterns.getContext(), benefit);
patterns.add<WarpOpExtractScalar>(patterns.getContext(), warpShuffleFromIdxFn,
benefit);
patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
benefit);
}
+void mlir::vector::populateWarpSimplificationPatterns(
+ RewritePatternSet &patterns, PatternBenefit benefit) {
+ patterns.add<WarpOpDeadResult, WarpOpForwardOperand>(patterns.getContext(),
+ benefit);
+}
+
void mlir::vector::populateDistributeReduction(
RewritePatternSet &patterns,
const DistributedReductionFn &distributedReductionFn,
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index a54ae816570a8..feec10e6492f7 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -660,6 +660,7 @@ struct TestVectorDistribution
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn, /*benefit=*/1,
/*readBenefit=*/0);
+ vector::populateWarpSimplificationPatterns(patterns);
vector::populateDistributeReduction(patterns, warpReduction, 1);
populateDistributeTransferWriteOpPatterns(patterns, distributionFn, 2);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
@@ -672,6 +673,7 @@ struct TestVectorDistribution
RewritePatternSet patterns(ctx);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn);
+ vector::populateWarpSimplificationPatterns(patterns);
vector::populateDistributeReduction(patterns, warpReduction);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
>From 1ffe5c8e7e988185e5089d05f22fe40d9f267914 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 26 Mar 2025 00:09:27 +0000
Subject: [PATCH 22/53] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 207 ++++++++++--------
1 file changed, 113 insertions(+), 94 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 04ff165bb5313..9252b0ca226ae 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -21,6 +21,7 @@
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeRange.h"
@@ -908,8 +909,9 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
/// | 32x16 | [1, 16] | 32x1 |
/// | 32x16 | [2, 8] | 16x2 |
/// | 2x32x16 | [1, 16] | 2x32x1 |
-FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
- VectorType originalType) {
+FailureOr<VectorType>
+getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+ VectorType originalType) {
llvm::SmallVector<int64_t, 2> distributedShape;
if (!sgMap)
return failure();
@@ -936,14 +938,30 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::SGMapAttr sgMap,
return newVectorType;
}
-/// An operation can be sunk out of WarpExecuteOnLane0 if all ops in its
-/// use-def chain are already sunk.
-static bool canBeSinked(Operation *op) {
- DenseSet<Operation *> visited;
- visited.insert(op);
- while (!visited.empty()) {
- }
- return true;
+static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+ VectorType originalType) {
+ auto shape = originalType.getShape();
+ auto distVecTyOrFailure =
+ xegpu::TensorDescType::get(shape, originalType.getElementType(),
+ /*array_length=*/1, /*boundary_check=*/true,
+ /*memory_space=*/xegpu::MemorySpace::Global,
+ sgMap)
+ .getDistributedVectorType();
+ assert(llvm::succeeded(distVecTyOrFailure) &&
+ "Failed to compute distributed vector type for the given vector type");
+ return distVecTyOrFailure.value();
+}
+
+static Value reconcileDistribtedVecType(Value orig, VectorType expected,
+ PatternRewriter &rewriter) {
+ assert(isa<VectorType>(orig.getType()) && "expecting vector type");
+ auto origVecType = cast<VectorType>(orig.getType());
+ /// No need to reconcile if the types are the same.
+ if (origVecType == expected)
+ return orig;
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
}
LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
@@ -1004,40 +1022,51 @@ SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
if (storeOp.getTensorDescType().getShape().size() != 2)
return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
- auto distributedTypeOrFailure =
- getDistributedVectorType(sgMap, storeOp.getValueType());
- if (failed(distributedTypeOrFailure))
+ auto distriburtedTypeByWarpOp =
+ getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+ if (failed(distriburtedTypeByWarpOp))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
- VectorType newVectorType = distributedTypeOrFailure.value();
+ VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, subgroupOp,
/* new yielded values = */
- ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+ ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
/* new yielded types = */
- TypeRange{storeOp.getTensorDescType(), newVectorType}, newRetIndices);
-
- // Create a new store op outside the warp op with the distributed vector type.
- // Tensor descriptor is not distributed.
+ TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+ newRetIndices);
+ /// Create a new store op outside the warp op with the distributed vector
+ /// type. Tensor descriptor is not distributed.
rewriter.setInsertionPointAfter(newWarpOp);
- auto newStoreOp =
- cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+ SmallVector<Value> newStoreOperands;
+
+ /// For the value operand, there can be a conflict between the vector type
+ /// distributed by the warp op and (xegpu-specific) distributed type supported
+ /// by the store op. We reconcile these mismatches by inserting a cast. These
+  /// get cancelled out later.
+ auto storeNdDistributedValueTyOrFailure =
+ storeOp.getTensorDescType().getDistributedVectorType();
+ if (failed(storeNdDistributedValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ storeOp, "Failed to get distributed vector type for the store op");
+ newStoreOperands.push_back(reconcileDistribtedVecType(
+ newWarpOp.getResult(newRetIndices[0]),
+ storeNdDistributedValueTyOrFailure.value(), rewriter));
+ newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+
+ rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
+ newStoreOperands, storeOp->getAttrs());
rewriter.eraseOp(storeOp);
- newStoreOp.getTensorDescMutable().assign(
- newWarpOp.getResult(newRetIndices[0]));
- newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
-
return success();
}
LogicalResult
SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const {
- OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
- return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
- });
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
if (!operand)
return rewriter.notifyMatchFailure(subgroupOp,
"warp result is not a xegpu::LoadNd op");
@@ -1049,29 +1078,31 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
return rewriter.notifyMatchFailure(
loadOp, "the source tensor descriptor lacks sg_map attribute");
- auto tensorDecShape = tensorDescTy.getShape();
- if (tensorDecShape.size() != 2)
- return rewriter.notifyMatchFailure(loadOp,
- "unsupported tensor descriptor shape");
-
- auto distributedTypeOrFailure =
- getDistributedVectorType(sgMap, loadOp.getType());
- if (failed(distributedTypeOrFailure))
- return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
- VectorType newVectorType = distributedTypeOrFailure.value();
-
unsigned operandIdx = operand->getOperandNumber();
+ VectorType distributedTypeByWarpOp =
+ cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
/* new yielded types = */ tensorDescTy, newRetIndices);
- // Create a new load op outside the warp op with the distributed vector type.
+ /// Create a new load op outside the warp op with the distributed vector type.
rewriter.setInsertionPointAfter(newWarpOp);
- auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
- newWarpOp.getLoc(), newVectorType, newWarpOp->getResults()[0],
- loadOp->getAttrs());
+ auto loadNdDistValueTyOrFailure =
+ loadOp.getTensorDescType().getDistributedVectorType();
+ if (failed(loadNdDistValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ loadOp, "Failed to get distributed vector type for the load op");
+ Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+ newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+ newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
+ /// There can be a conflict between the vector type distributed by the warp op
+ /// and (xegpu-specific) distributed type supported by the load op. We
+ /// reconcile these mismatches by inserting a cast.
+ newLoadOp =
+ reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
return success();
}
@@ -1079,10 +1110,8 @@ SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
LogicalResult
SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const {
- OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
- return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
- });
-
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
if (!operand)
return rewriter.notifyMatchFailure(
subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
@@ -1131,10 +1160,7 @@ SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
LogicalResult
SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const {
- OpOperand *operand = getWarpResult(subgroupOp, [](Operation *op) {
- return isa<xegpu::DpasOp>(op) && op->hasOneUse();
- });
-
+ OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
if (!operand)
return rewriter.notifyMatchFailure(subgroupOp,
"warp result is not a xegpu::Dpas op");
@@ -1148,28 +1174,29 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
return rewriter.notifyMatchFailure(
dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
- auto distributedLhsTypeOrFailure =
- getDistributedVectorType(sgMapA, dpasOp.getLhsType());
- auto distributedRhsTypeOrFailure =
- getDistributedVectorType(sgMapB, dpasOp.getRhsType());
- auto distributedResultTypeOrFailure =
- getDistributedVectorType(sgMapOut, dpasOp.getResultType());
- if (failed(distributedLhsTypeOrFailure) ||
- failed(distributedRhsTypeOrFailure) ||
- failed(distributedResultTypeOrFailure))
+ auto distLhsTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+ auto distRhsTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+ auto distResultTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+ if (failed(distLhsTypeByWarpOpOrFailure) ||
+ failed(distRhsTypeByWarpOpOrFailure) ||
+ failed(distResultTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(
dpasOp,
"Failed to distribute the A, B or output types in xegpu::Dpas op");
llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
- llvm::SmallVector<Type, 3> newYieldTypes{distributedLhsTypeOrFailure.value(),
- distributedRhsTypeOrFailure.value()};
- // Dpas acc operand is optional.
+ llvm::SmallVector<Type, 3> newYieldTypes{
+ distLhsTypeByWarpOpOrFailure.value(),
+ distRhsTypeByWarpOpOrFailure.value()};
+ /// Dpas acc operand is optional.
if (dpasOp.getAcc()) {
newYieldValues.push_back(dpasOp.getAcc());
- newYieldTypes.push_back(distributedResultTypeOrFailure.value());
+ newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
}
- // Create a new warp op without the dpas.
+ /// Create a new warp op without the dpas.
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -1177,13 +1204,30 @@ SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
// Create a new dpas op outside the warp op.
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newDpasOperands;
+ SmallVector<VectorType> newDpasOperandExpectedTypes;
+ /// Reconcile the distributed types with the original types.
+ newDpasOperandExpectedTypes.push_back(
+ getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+ newDpasOperandExpectedTypes.push_back(
+ getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+ if (dpasOp.getAcc()) {
+ newDpasOperandExpectedTypes.push_back(
+ getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+ }
+
for (auto i : newRetIndices) {
- newDpasOperands.push_back(newWarpOp.getResult(i));
+ newDpasOperands.push_back(reconcileDistribtedVecType(
+ newWarpOp.getResult(i),
+ newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
}
auto newDpasOp = rewriter.create<xegpu::DpasOp>(
- newWarpOp->getLoc(), distributedResultTypeOrFailure.value(),
+ newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
newDpasOperands, dpasOp->getAttrs());
Value disributedVal = newWarpOp.getResult(operandIdx);
+  /// Reconcile the output type.
+ disributedVal = reconcileDistribtedVecType(
+ disributedVal, getDistributedVectorType(sgMapOut, dpasOp.getResultType()),
+ rewriter);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
return success();
}
@@ -1235,31 +1279,6 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
/// Finally, do the SIMD to SIMT distribution.
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- auto distributionFn = [](Value val) {
- // Create an identity dim map of the same rank as the vector.
- VectorType vecType = dyn_cast<VectorType>(val.getType());
- int64_t vecRank = vecType ? vecType.getRank() : 0;
- OpBuilder builder(val.getContext());
- if (vecRank == 0)
- return AffineMap::get(val.getContext());
- return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
- };
- auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
- int64_t warpSz) {
- assert((val.getType().isF32() || val.getType().isInteger(32)) &&
- "unsupported shuffle type");
- Type i32Type = builder.getIntegerType(32);
- Value srcIdxI32 = builder.create<arith::IndexCastOp>(loc, i32Type, srcIdx);
- Value warpSzI32 = builder.create<arith::ConstantOp>(
- loc, builder.getIntegerAttr(i32Type, warpSz));
- Value result = builder
- .create<gpu::ShuffleOp>(loc, val, srcIdxI32, warpSzI32,
- gpu::ShuffleMode::IDX)
- .getResult(0);
- return result;
- };
- vector::populatePropagateWarpVectorDistributionPatterns(
- patterns, distributionFn, shuffleFn);
- llvm::errs() << AffineMap::getMultiDimIdentityMap(2, &getContext()) << "\n";
- // (void)applyPatternsGreedily(getOperation(), std::move(patterns));
+ vector::populateWarpSimplificationPatterns(patterns);
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
>From e5521f93b89ea344a659e8d294ba45023cc34227 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 27 Mar 2025 19:28:51 +0000
Subject: [PATCH 23/53] save work before merging with Chao's PR
---
.../Vector/Transforms/VectorDistribution.h | 4 -
.../Vector/Transforms/VectorDistribute.cpp | 16 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 690 +++++++++---------
.../Dialect/Vector/TestVectorTransforms.cpp | 2 -
4 files changed, 348 insertions(+), 364 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index 082d990cee8a4..dda45219b2acc 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -98,10 +98,6 @@ void populatePropagateWarpVectorDistributionPatterns(
const WarpShuffleFromIdxFn &warpShuffleFromIdxFn,
PatternBenefit benefit = 1, PatternBenefit readBenefit = 0);
-/// Patterns for simplification of WarpExecuteOnLane0Op during distribution.
-void populateWarpSimplificationPatterns(RewritePatternSet &pattern,
- PatternBenefit benefit = 1);
-
/// Lambda signature to compute a reduction of a distributed value for the given
/// reduction kind and size.
using DistributedReductionFn =
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index f0d771142e307..e214257de2cdf 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1761,23 +1761,17 @@ void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
const WarpShuffleFromIdxFn &warpShuffleFromIdxFn, PatternBenefit benefit,
PatternBenefit readBenefit) {
patterns.add<WarpOpTransferRead>(patterns.getContext(), readBenefit);
- patterns
- .add<WarpOpElementwise, WarpOpBroadcast, WarpOpShapeCast, WarpOpExtract,
- WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
- WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
- patterns.getContext(), benefit);
+ patterns.add<WarpOpElementwise, WarpOpDeadResult, WarpOpBroadcast,
+ WarpOpShapeCast, WarpOpExtract, WarpOpForwardOperand,
+ WarpOpConstant, WarpOpExtractElement, WarpOpInsertElement,
+ WarpOpInsertScalar, WarpOpInsert, WarpOpCreateMask>(
+ patterns.getContext(), benefit);
patterns.add<WarpOpExtractScalar>(patterns.getContext(), warpShuffleFromIdxFn,
benefit);
patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
benefit);
}
-void mlir::vector::populateWarpSimplificationPatterns(
- RewritePatternSet &patterns, PatternBenefit benefit) {
- patterns.add<WarpOpDeadResult, WarpOpForwardOperand>(patterns.getContext(),
- benefit);
-}
-
void mlir::vector::populateDistributeReduction(
RewritePatternSet &patterns,
const DistributedReductionFn &distributedReductionFn,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9252b0ca226ae..38d9fe6c88800 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -683,7 +683,6 @@ void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
static LogicalResult
attachLayoutAttributes(Operation *top,
llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
- llvm::errs() << "op name : " << top->getName() << "\n";
/// Helper to convert SGMap to xegpu::SGMapAttr.
auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
auto layout = getPropagatedLayout(r);
@@ -759,6 +758,71 @@ namespace {
/// SIMT Distribution Patterns
///===----------------------------------------------------------------------===///
+/// Returns the distributed vector type for a source vector type according to
+/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
+/// corresponding wi_layout dimension. If array_length > 1, that is appended to
+/// the front of the distributed shape.
+/// Examples:
+/// | original vector shape | wi_layout | distributed vector shape |
+/// |-----------------------|-----------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+FailureOr<VectorType>
+getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+ VectorType originalType) {
+ llvm::SmallVector<int64_t, 2> distributedShape;
+ if (!sgMap)
+ return failure();
+
+ auto wiLayout = sgMap.getWiLayout();
+ assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
+ "expecting 2D or 3D shape for the original vector type");
+ assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+ // Original type can be 2D or 3D (array_length > 1), the last two dims are the
+ // block shape.
+ auto blockShape = originalType.getShape().take_back(2);
+ // Check if the block vector shape can be distributed evenly.
+ if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+ return failure();
+
+ if (originalType.getRank() == 3) {
+ distributedShape.push_back(originalType.getShape()[0]);
+ }
+ for (unsigned i = 0; i < 2; ++i) {
+ distributedShape.push_back(blockShape[i] / wiLayout[i]);
+ }
+ auto newVectorType =
+ VectorType::get(distributedShape, originalType.getElementType());
+ return newVectorType;
+}
+
+static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+ VectorType originalType) {
+ auto shape = originalType.getShape();
+ auto distVecTyOrFailure =
+ xegpu::TensorDescType::get(shape, originalType.getElementType(),
+ /*array_length=*/1, /*boundary_check=*/true,
+ /*memory_space=*/xegpu::MemorySpace::Global,
+ sgMap)
+ .getDistributedVectorType();
+ assert(llvm::succeeded(distVecTyOrFailure) &&
+ "Failed to compute distributed vector type for the given vector type");
+ return distVecTyOrFailure.value();
+}
+
+static Value reconcileDistribtedVecType(Value orig, VectorType expected,
+ PatternRewriter &rewriter) {
+ assert(isa<VectorType>(orig.getType()) && "expecting vector type");
+ auto origVecType = cast<VectorType>(orig.getType());
+ /// No need to reconcile if the types are the same.
+ if (origVecType == expected)
+ return orig;
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
+}
+
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
/// contained within a WarpExecuteOnLane0Op.
@@ -786,7 +850,48 @@ struct MoveFuncBodyToWarpExecuteOnLane0
: public OpRewritePattern<gpu::GPUFuncOp> {
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
- PatternRewriter &rewriter) const override;
+ PatternRewriter &rewriter) const override {
+ /// If the function only contains a single void return, skip.
+ if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+ }))
+ return failure();
+ /// If the function already moved inside a warp_execute_on_lane0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ /// Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+ /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+ /// original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
+ /** upperBound = **/ mlir::IntegerAttr());
+ auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+ auto &warpBodyBlock = warpOp.getBodyRegion().front();
+ /// Replace the ReturnOp of the original gpu function with a YieldOp.
+ auto origRetunOp =
+ cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+ rewriter.setInsertionPointAfter(origRetunOp);
+ rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
+ origRetunOp.getOperands());
+ rewriter.eraseOp(origRetunOp);
+ /// Move the original function body to the WarpExecuteOnLane0Op body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+ return success();
+ }
};
/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
@@ -823,7 +928,53 @@ struct MoveFuncBodyToWarpExecuteOnLane0
struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const override;
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+
+ auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+ if (!srcTypedVal)
+ return rewriter.notifyMatchFailure(
+ descOp, "expecting a memref typed value as the source");
+
+ auto descOffsets = descOp.getMixedOffsets();
+
+ xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ descOp, "the tensor descriptor lacks sg_map attribute");
+
+ SmallVector<size_t> newRetIndices;
+ SmallVector<Value> newYieldValues;
+ SmallVector<Type> newYieldTypes;
+
+ for (auto arg : descOp->getOperands()) {
+ newYieldValues.push_back(arg);
+ newYieldTypes.push_back(arg.getType());
+ }
+ rewriter.setInsertionPoint(subgroupOp);
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+ /* new yielded types = */ newYieldTypes, newRetIndices);
+
+ SmallVector<Value> newDescOperands;
+ for (auto i : newRetIndices) {
+ newDescOperands.push_back(newWarpOp.getResult(i));
+ }
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+ newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+ descOp->getAttrs());
+
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+ return success();
+ }
};
/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
@@ -853,7 +1004,62 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const override;
+ PatternRewriter &rewriter) const override {
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+ if (!storeOp)
+ return failure();
+
+ auto tensorDescTy = storeOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+ if (storeOp.getTensorDescType().getShape().size() != 2)
+ return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+
+ auto distriburtedTypeByWarpOp =
+ getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+ if (failed(distriburtedTypeByWarpOp))
+ return rewriter.notifyMatchFailure(storeOp,
+ "Failed to distribute the type");
+ VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp,
+ /* new yielded values = */
+ ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+ /* new yielded types = */
+ TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+ newRetIndices);
+ /// Create a new store op outside the warp op with the distributed vector
+ /// type. Tensor descriptor is not distributed.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newStoreOperands;
+
+ /// For the value operand, there can be a conflict between the vector type
+ /// distributed by the warp op and (xegpu-specific) distributed type
+ /// supported by the store op. We reconcile these mismatches by inserting a
+    /// cast. These get cancelled out later.
+ auto storeNdDistributedValueTyOrFailure =
+ storeOp.getTensorDescType().getDistributedVectorType();
+ if (failed(storeNdDistributedValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ storeOp, "Failed to get distributed vector type for the store op");
+ newStoreOperands.push_back(reconcileDistribtedVecType(
+ newWarpOp.getResult(newRetIndices[0]),
+ storeNdDistributedValueTyOrFailure.value(), rewriter));
+ newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+
+ rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
+ newStoreOperands, storeOp->getAttrs());
+ rewriter.eraseOp(storeOp);
+ return success();
+ }
};
/// Clone a load_nd feeding into vector.yield op for the enclosing
@@ -888,349 +1094,132 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const override;
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+ auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+ xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+ unsigned operandIdx = operand->getOperandNumber();
+ VectorType distributedTypeByWarpOp =
+ cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+ /* new yielded types = */ tensorDescTy, newRetIndices);
+
+ /// Create a new load op outside the warp op with the distributed vector
+ /// type.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto loadNdDistValueTyOrFailure =
+ loadOp.getTensorDescType().getDistributedVectorType();
+ if (failed(loadNdDistValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ loadOp, "Failed to get distributed vector type for the load op");
+ Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+ newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+ newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ /// There can be a conflict between the vector type distributed by the warp
+ /// op and (xegpu-specific) distributed type supported by the load op. We
+ /// reconcile these mismatches by inserting a cast.
+ newLoadOp = reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp,
+ rewriter);
+ rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+ return success();
+ }
};
struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const override;
-};
-
-} // namespace
-
-/// Returns the distributed vector type for a source vector type according to
-/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
-/// corresponding wi_layout dimension. If array_length > 1, that is appended to
-/// the front of the distributed shape.
-/// Examples:
-/// | original vector shape | wi_layout | distributed vector shape |
-/// |-----------------------|-----------|--------------------------|
-/// | 32x16 | [1, 16] | 32x1 |
-/// | 32x16 | [2, 8] | 16x2 |
-/// | 2x32x16 | [1, 16] | 2x32x1 |
-FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
- VectorType originalType) {
- llvm::SmallVector<int64_t, 2> distributedShape;
- if (!sgMap)
- return failure();
-
- auto wiLayout = sgMap.getWiLayout();
- assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
- "expecting 2D or 3D shape for the original vector type");
- assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
- // Original type can be 2D or 3D (array_length > 1), the last two dims are the
- // block shape.
- auto blockShape = originalType.getShape().take_back(2);
- // Check if the block vector shape can be distributed evenly.
- if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
- return failure();
-
- if (originalType.getRank() == 3) {
- distributedShape.push_back(originalType.getShape()[0]);
- }
- for (unsigned i = 0; i < 2; ++i) {
- distributedShape.push_back(blockShape[i] / wiLayout[i]);
- }
- auto newVectorType =
- VectorType::get(distributedShape, originalType.getElementType());
- return newVectorType;
-}
-
-static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
- VectorType originalType) {
- auto shape = originalType.getShape();
- auto distVecTyOrFailure =
- xegpu::TensorDescType::get(shape, originalType.getElementType(),
- /*array_length=*/1, /*boundary_check=*/true,
- /*memory_space=*/xegpu::MemorySpace::Global,
- sgMap)
- .getDistributedVectorType();
- assert(llvm::succeeded(distVecTyOrFailure) &&
- "Failed to compute distributed vector type for the given vector type");
- return distVecTyOrFailure.value();
-}
-
-static Value reconcileDistribtedVecType(Value orig, VectorType expected,
- PatternRewriter &rewriter) {
- assert(isa<VectorType>(orig.getType()) && "expecting vector type");
- auto origVecType = cast<VectorType>(orig.getType());
- /// No need to reconcile if the types are the same.
- if (origVecType == expected)
- return orig;
- auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
- expected, orig);
- return castOp.getResult(0);
-}
-
-LogicalResult MoveFuncBodyToWarpExecuteOnLane0::matchAndRewrite(
- gpu::GPUFuncOp gpuFuncOp, PatternRewriter &rewriter) const {
- /// If the function already moved inside a warp_execute_on_lane0, skip.
- if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
- return isa<gpu::WarpExecuteOnLane0Op>(op);
- }))
- return failure();
- /// Create a new function with the same signature.
- auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
- gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
- /// Create a WarpExecuteOnLane0Op with same arguments and results as the
- /// original gpuFuncOp.
- rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
- auto laneId = rewriter.create<gpu::LaneIdOp>(
- newGpuFunc.getLoc(), rewriter.getIndexType(),
- /** upperBound = **/ mlir::IntegerAttr());
- auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
- auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
- laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
- newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
- auto &warpBodyBlock = warpOp.getBodyRegion().front();
- /// Replace the ReturnOp of the original gpu function with a YieldOp.
- auto origRetunOp =
- cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
- rewriter.setInsertionPointAfter(origRetunOp);
- rewriter.create<gpu::YieldOp>(origRetunOp.getLoc(),
- origRetunOp.getOperands());
- rewriter.eraseOp(origRetunOp);
- /// Move the original function body to the WarpExecuteOnLane0Op body.
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
- warpOp.getBodyRegion().begin());
- rewriter.eraseBlock(&warpBodyBlock);
- /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
- rewriter.setInsertionPointAfter(warpOp);
- rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
- rewriter.replaceOp(gpuFuncOp, newGpuFunc);
- return success();
-}
-
-LogicalResult
-SubgroupOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const {
- auto yield = cast<gpu::YieldOp>(
- subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
- Operation *lastNode = yield->getPrevNode();
- auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
- if (!storeOp)
- return failure();
-
- auto tensorDescTy = storeOp.getTensorDescType();
- xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
- if (!sgMap)
- return rewriter.notifyMatchFailure(
- storeOp, "the source tensor descriptor lacks sg_map attribute");
-
- if (storeOp.getTensorDescType().getShape().size() != 2)
- return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-
- auto distriburtedTypeByWarpOp =
- getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
- if (failed(distriburtedTypeByWarpOp))
- return rewriter.notifyMatchFailure(storeOp,
- "Failed to distribute the type");
- VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
-
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, subgroupOp,
- /* new yielded values = */
- ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
- /* new yielded types = */
- TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
- newRetIndices);
- /// Create a new store op outside the warp op with the distributed vector
- /// type. Tensor descriptor is not distributed.
- rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newStoreOperands;
-
- /// For the value operand, there can be a conflict between the vector type
- /// distributed by the warp op and (xegpu-specific) distributed type supported
- /// by the store op. We reconcile these mismatches by inserting a cast. These
-  /// get cancelled out later.
- auto storeNdDistributedValueTyOrFailure =
- storeOp.getTensorDescType().getDistributedVectorType();
- if (failed(storeNdDistributedValueTyOrFailure))
- return rewriter.notifyMatchFailure(
- storeOp, "Failed to get distributed vector type for the store op");
- newStoreOperands.push_back(reconcileDistribtedVecType(
- newWarpOp.getResult(newRetIndices[0]),
- storeNdDistributedValueTyOrFailure.value(), rewriter));
- newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
-
- rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
- newStoreOperands, storeOp->getAttrs());
- rewriter.eraseOp(storeOp);
- return success();
-}
-
-LogicalResult
-SubgroupOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const {
- OpOperand *operand =
- getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(subgroupOp,
- "warp result is not a xegpu::LoadNd op");
-
- auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
- xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
- xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
- if (!sgMap)
- return rewriter.notifyMatchFailure(
- loadOp, "the source tensor descriptor lacks sg_map attribute");
-
- unsigned operandIdx = operand->getOperandNumber();
- VectorType distributedTypeByWarpOp =
- cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
-
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
- /* new yielded types = */ tensorDescTy, newRetIndices);
-
- /// Create a new load op outside the warp op with the distributed vector type.
- rewriter.setInsertionPointAfter(newWarpOp);
- auto loadNdDistValueTyOrFailure =
- loadOp.getTensorDescType().getDistributedVectorType();
- if (failed(loadNdDistValueTyOrFailure))
- return rewriter.notifyMatchFailure(
- loadOp, "Failed to get distributed vector type for the load op");
- Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
- newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
- newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
- Value distributedVal = newWarpOp.getResult(operandIdx);
- /// There can be a conflict between the vector type distributed by the warp op
- /// and (xegpu-specific) distributed type supported by the load op. We
- /// reconcile these mismatches by inserting a cast.
- newLoadOp =
- reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
- rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
- return success();
-}
-
-LogicalResult
-SubgroupOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const {
- OpOperand *operand =
- getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
- auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
- unsigned operandIdx = operand->getOperandNumber();
-
- auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
- if (!srcTypedVal)
- return rewriter.notifyMatchFailure(
- descOp, "expecting a memref typed value as the source");
-
- auto descOffsets = descOp.getMixedOffsets();
-
- xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
- if (!sgMap)
- return rewriter.notifyMatchFailure(
- descOp, "the tensor descriptor lacks sg_map attribute");
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldValues;
- SmallVector<Type> newYieldTypes;
-
- for (auto arg : descOp->getOperands()) {
- newYieldValues.push_back(arg);
- newYieldTypes.push_back(arg.getType());
- }
- rewriter.setInsertionPoint(subgroupOp);
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-      rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
- /* new yielded types = */ newYieldTypes, newRetIndices);
-
- SmallVector<Value> newDescOperands;
- for (auto i : newRetIndices) {
- newDescOperands.push_back(newWarpOp.getResult(i));
- }
- rewriter.setInsertionPointAfter(newWarpOp);
- auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
- newWarpOp.getLoc(), descOp.getType(), newDescOperands,
- descOp->getAttrs());
-
- Value distributedVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(distributedVal, newDescOp);
- return success();
-}
-
-LogicalResult
-SubgroupOpDpas::matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
- PatternRewriter &rewriter) const {
- OpOperand *operand = getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(subgroupOp,
- "warp result is not a xegpu::Dpas op");
-
- auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
- unsigned operandIdx = operand->getOperandNumber();
- xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
- xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
- xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
- if (!sgMapA || !sgMapB || !sgMapOut)
- return rewriter.notifyMatchFailure(
- dpasOp, "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
-
- auto distLhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
- auto distRhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
- auto distResultTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
- if (failed(distLhsTypeByWarpOpOrFailure) ||
- failed(distRhsTypeByWarpOpOrFailure) ||
- failed(distResultTypeByWarpOpOrFailure))
- return rewriter.notifyMatchFailure(
- dpasOp,
- "Failed to distribute the A, B or output types in xegpu::Dpas op");
-
- llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(), dpasOp.getRhs()};
- llvm::SmallVector<Type, 3> newYieldTypes{
- distLhsTypeByWarpOpOrFailure.value(),
- distRhsTypeByWarpOpOrFailure.value()};
- /// Dpas acc operand is optional.
- if (dpasOp.getAcc()) {
- newYieldValues.push_back(dpasOp.getAcc());
- newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
- }
- /// Create a new warp op without the dpas.
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
-
- // Create a new dpas op outside the warp op.
- rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newDpasOperands;
- SmallVector<VectorType> newDpasOperandExpectedTypes;
- /// Reconcile the distributed types with the original types.
- newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
- newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
- if (dpasOp.getAcc()) {
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(subgroupOp,
+ "warp result is not a xegpu::Dpas op");
+
+ auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+ xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
+ xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
+ xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
+ if (!sgMapA || !sgMapB || !sgMapOut)
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
+
+ auto distLhsTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+ auto distRhsTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+ auto distResultTypeByWarpOpOrFailure =
+ getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+ if (failed(distLhsTypeByWarpOpOrFailure) ||
+ failed(distRhsTypeByWarpOpOrFailure) ||
+ failed(distResultTypeByWarpOpOrFailure))
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "Failed to distribute the A, B or output types in xegpu::Dpas op");
+
+ llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
+ dpasOp.getRhs()};
+ llvm::SmallVector<Type, 3> newYieldTypes{
+ distLhsTypeByWarpOpOrFailure.value(),
+ distRhsTypeByWarpOpOrFailure.value()};
+ /// Dpas acc operand is optional.
+ if (dpasOp.getAcc()) {
+ newYieldValues.push_back(dpasOp.getAcc());
+ newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
+ }
+ /// Create a new warp op without the dpas.
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+ // Create a new dpas op outside the warp op.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newDpasOperands;
+ SmallVector<VectorType> newDpasOperandExpectedTypes;
+ /// Reconcile the distributed types with the original types.
newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
- }
+ getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+ newDpasOperandExpectedTypes.push_back(
+ getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+ if (dpasOp.getAcc()) {
+ newDpasOperandExpectedTypes.push_back(
+ getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+ }
- for (auto i : newRetIndices) {
- newDpasOperands.push_back(reconcileDistribtedVecType(
- newWarpOp.getResult(i),
- newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+ for (auto i : newRetIndices) {
+ newDpasOperands.push_back(reconcileDistribtedVecType(
+ newWarpOp.getResult(i),
+ newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+ }
+ auto newDpasOp = rewriter.create<xegpu::DpasOp>(
+ newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
+ newDpasOperands, dpasOp->getAttrs());
+ Value disributedVal = newWarpOp.getResult(operandIdx);
+    /// Reconcile the output type.
+ disributedVal = reconcileDistribtedVecType(
+ disributedVal,
+ getDistributedVectorType(sgMapOut, dpasOp.getResultType()), rewriter);
+ rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
+ return success();
}
- auto newDpasOp = rewriter.create<xegpu::DpasOp>(
- newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
- newDpasOperands, dpasOp->getAttrs());
- Value disributedVal = newWarpOp.getResult(operandIdx);
- /// Reconile the output type.
- disributedVal = reconcileDistribtedVecType(
- disributedVal, getDistributedVectorType(sgMapOut, dpasOp.getResultType()),
- rewriter);
- rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
- return success();
-}
+};
+
+} // namespace
namespace {
struct XeGPUSubgroupDistributePass final
@@ -1265,20 +1254,27 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
if (failed(resolveLayoutConflicts(getOperation())))
signalPassFailure();
/// Move all operations inside a GPU functions inside
- /// gpu.warp_execute_on_lane0
+ /// gpu.warp_execute_on_lane0.
+  /// We want to avoid ops from being hoisted out of the
+  /// gpu.warp_execute_on_lane0 region.
+ // GreedyRewriteConfig config;
+ // config.cseConstants = false;
+ // config.fold = false;
+ // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
- /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
- /// region.
- GreedyRewriteConfig config;
- config.cseConstants = false;
- config.fold = false;
- (void)applyPatternsGreedily(getOperation(), std::move(patterns), config);
+
+ (void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
/// Finally, do the SIMD to SIMT distribution.
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- vector::populateWarpSimplificationPatterns(patterns);
+ /// TODO: These are not used at this point.
+ auto distributionFn = [](Value val) { return AffineMap(); };
+ auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+ int64_t warpSz) { return Value(); };
+ vector::populatePropagateWarpVectorDistributionPatterns(
+ patterns, distributionFn, shuffleFn);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index feec10e6492f7..a54ae816570a8 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -660,7 +660,6 @@ struct TestVectorDistribution
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn, /*benefit=*/1,
/*readBenefit=*/0);
- vector::populateWarpSimplificationPatterns(patterns);
vector::populateDistributeReduction(patterns, warpReduction, 1);
populateDistributeTransferWriteOpPatterns(patterns, distributionFn, 2);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
@@ -673,7 +672,6 @@ struct TestVectorDistribution
RewritePatternSet patterns(ctx);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn);
- vector::populateWarpSimplificationPatterns(patterns);
vector::populateDistributeReduction(patterns, warpReduction);
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
}
>From 5700c8149354b94e05d3570de0dcea32d51039c4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 29 Mar 2025 03:43:26 +0000
Subject: [PATCH 24/53] merge xegpu changes
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 7 ++
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 11 ++++
.../Transforms/XeGPUSubgroupDistribute.cpp | 65 ++++++++++---------
3 files changed, 51 insertions(+), 32 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4afeef1427e8b..2baf34550dc38 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -254,6 +254,13 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
}
}];
+ let builders = [
+ AttrBuilder<(ins
+ "ArrayRef<int>": $lane_layout,
+ "ArrayRef<int>": $lane_data
+ )>
+ ];
+
let assemblyFormat = "`<` struct(params) `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 6ef1a2deebcab..946a3961aa5c1 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,7 +8,9 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
namespace mlir {
@@ -113,6 +115,15 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return success();
}
+LayoutAttr LayoutAttr::get(mlir::MLIRContext *context, ArrayRef<int> laneLayout,
+ ArrayRef<int> laneData) {
+ return Base::get(context, ScopeAttr::get(context, Scope::Lane),
+ DenseI32ArrayAttr(), DenseI32ArrayAttr(),
+ DenseI32ArrayAttr(),
+ DenseI32ArrayAttr::get(context, laneLayout),
+ DenseI32ArrayAttr::get(context, laneData));
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 38d9fe6c88800..e2d8b6b06c513 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -659,7 +659,7 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
-void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
+void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
@@ -667,11 +667,11 @@ void attachLayoutAttributeToUsers(Value v, xegpu::SGMapAttr layout) {
/// attribute.
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
if (operandNumber == 0)
- dpasOp.setSgMapAAttr(layout);
+ dpasOp.setALayoutAttr(layout);
else if (operandNumber == 1)
- dpasOp.setSgMapBAttr(layout);
+ dpasOp.setBLayoutAttr(layout);
else if (operandNumber == 2)
- dpasOp.setSgMapCAttr(layout);
+ dpasOp.setCLayoutAttr(layout);
continue;
}
/// For every other user, use a generic attribute name.
@@ -684,17 +684,17 @@ static LogicalResult
attachLayoutAttributes(Operation *top,
llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
/// Helper to convert SGMap to xegpu::SGMapAttr.
- auto getSGMapForResult = [&](Value r) -> xegpu::SGMapAttr {
+ auto getSGMapForResult = [&](Value r) -> xegpu::LayoutAttr {
auto layout = getPropagatedLayout(r);
if (!layout.isAssigned())
return {};
- SmallVector<uint32_t, 2> wiLayout, wiData;
+ SmallVector<int, 2> wiLayout, wiData;
for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
layout.getDataAsArrayRef())) {
- wiLayout.push_back(static_cast<uint32_t>(layout));
- wiData.push_back(static_cast<uint32_t>(data));
+ wiLayout.push_back(static_cast<int>(layout));
+ wiData.push_back(static_cast<int>(data));
}
- return xegpu::SGMapAttr::get(top->getContext(), wiLayout, wiData);
+ return xegpu::LayoutAttr::get(r.getContext(), wiLayout, wiData);
};
/// Attach the layout attributes to the results of the operations.
auto walkResult = top->walk([&](Operation *op) {
@@ -769,13 +769,13 @@ namespace {
/// | 32x16 | [2, 8] | 16x2 |
/// | 2x32x16 | [1, 16] | 2x32x1 |
FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
+getDistributedVecTypeBasedOnWiLayout(xegpu::LayoutAttr layout,
VectorType originalType) {
llvm::SmallVector<int64_t, 2> distributedShape;
- if (!sgMap)
+ if (!layout)
return failure();
- auto wiLayout = sgMap.getWiLayout();
+ auto wiLayout = layout.getLaneLayout();
assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
"expecting 2D or 3D shape for the original vector type");
assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
@@ -797,14 +797,14 @@ getDistributedVecTypeBasedOnWiLayout(xegpu::SGMapAttr sgMap,
return newVectorType;
}
-static VectorType getDistributedVectorType(xegpu::SGMapAttr sgMap,
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
VectorType originalType) {
auto shape = originalType.getShape();
auto distVecTyOrFailure =
xegpu::TensorDescType::get(shape, originalType.getElementType(),
/*array_length=*/1, /*boundary_check=*/true,
/*memory_space=*/xegpu::MemorySpace::Global,
- sgMap)
+ layout)
.getDistributedVectorType();
assert(llvm::succeeded(distVecTyOrFailure) &&
"Failed to compute distributed vector type for the given vector type");
@@ -944,8 +944,8 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
auto descOffsets = descOp.getMixedOffsets();
- xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
- if (!sgMap)
+ xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+ if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks sg_map attribute");
@@ -1013,8 +1013,8 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
return failure();
auto tensorDescTy = storeOp.getTensorDescType();
- xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
- if (!sgMap)
+ xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ if (!layout)
return rewriter.notifyMatchFailure(
storeOp, "the source tensor descriptor lacks sg_map attribute");
@@ -1022,7 +1022,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
auto distriburtedTypeByWarpOp =
- getDistributedVecTypeBasedOnWiLayout(sgMap, storeOp.getValueType());
+ getDistributedVecTypeBasedOnWiLayout(layout, storeOp.getValueType());
if (failed(distriburtedTypeByWarpOp))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
@@ -1103,8 +1103,8 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
- xegpu::SGMapAttr sgMap = tensorDescTy.getSGMapAttr();
- if (!sgMap)
+ xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ if (!layout)
return rewriter.notifyMatchFailure(
loadOp, "the source tensor descriptor lacks sg_map attribute");
@@ -1151,20 +1151,21 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- xegpu::SGMapAttr sgMapA = dpasOp.getSgMapAAttr();
- xegpu::SGMapAttr sgMapB = dpasOp.getSgMapBAttr();
- xegpu::SGMapAttr sgMapOut = dpasOp->getAttrOfType<xegpu::SGMapAttr>("r0");
- if (!sgMapA || !sgMapB || !sgMapOut)
+ xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
+ xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+ xegpu::LayoutAttr layoutOut =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>("r0");
+ if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
"the xegpu::Dpas op lacks sg_map attribute for A, B or output");
auto distLhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapA, dpasOp.getLhsType());
+ getDistributedVecTypeBasedOnWiLayout(layoutA, dpasOp.getLhsType());
auto distRhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapB, dpasOp.getRhsType());
+ getDistributedVecTypeBasedOnWiLayout(layoutB, dpasOp.getRhsType());
auto distResultTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(sgMapOut, dpasOp.getResultType());
+ getDistributedVecTypeBasedOnWiLayout(layoutOut, dpasOp.getResultType());
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
@@ -1193,12 +1194,12 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
SmallVector<VectorType> newDpasOperandExpectedTypes;
/// Reconcile the distributed types with the original types.
newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapA, dpasOp.getLhsType()));
+ getDistributedVectorType(layoutA, dpasOp.getLhsType()));
newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapB, dpasOp.getRhsType()));
+ getDistributedVectorType(layoutB, dpasOp.getRhsType()));
if (dpasOp.getAcc()) {
newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(sgMapOut, dpasOp.getResultType()));
+ getDistributedVectorType(layoutOut, dpasOp.getResultType()));
}
for (auto i : newRetIndices) {
@@ -1213,7 +1214,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
/// Reconile the output type.
disributedVal = reconcileDistribtedVecType(
disributedVal,
- getDistributedVectorType(sgMapOut, dpasOp.getResultType()), rewriter);
+ getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
return success();
}
>From 2334a9780b5ce1b129053f0080f1d37e4ae4a6a7 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 31 Mar 2025 19:26:21 +0000
Subject: [PATCH 25/53] refactor names
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 434 +++++++++---------
1 file changed, 223 insertions(+), 211 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index e2d8b6b06c513..8e1e846c94d3e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -55,7 +55,7 @@ using namespace mlir::dataflow;
/// HW dependent constants.
/// TODO: These constants should be queried from the target information.
-constexpr unsigned subgroupSize = 16; // How many work items in a subgroup.
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
/// If DPAS A or B operands have low precision element types they must be packed
/// according to the following sizes.
constexpr unsigned packedSizeInBitsForDefault =
@@ -69,8 +69,8 @@ namespace {
/// Layout
///===----------------------------------------------------------------------===///
-/// Helper class to store the ND layout of work items within a subgroup and data
-/// owned by each work item.
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
struct Layout {
SmallVector<int64_t, 3> layout;
Layout() = default;
@@ -91,123 +91,125 @@ int64_t Layout::operator[](size_t idx) const {
return layout[idx];
}
-/// WiLayout represents the layout of work items within a subgroup when it
-/// accesses some value. WiData represents the layout of data owned by each work
-/// item.
-using WiLayout = Layout;
-using WiData = Layout;
+/// LaneLayout represents the logical layout of lanes within a subgroup when it
+/// accesses some value. LaneData represents the logical layout of data owned by
+/// each work item.
+using LaneLayout = Layout;
+using LaneData = Layout;
///===----------------------------------------------------------------------===///
-/// SGMap
+/// LayoutInfo
///===----------------------------------------------------------------------===///
/// Helper class for tracking the analysis state of a value. For SGPropagation,
-/// the analysis state is simply the wi_layout and wi_data of each value.
+/// the analysis state is simply the lane_layout and lane_data of each value.
/// Purpose of this analysis to propagate some unique layout for each value in
/// the program starting from some known values (like DPAS, StoreNd, etc.).
///
-/// Given this, SGMap satisifies the following properties:
-/// 1) SGMap is a lattice with two states - assigned and not assigned.
-/// 2) Two SGMap values are equal if they are both assigned or both not
+/// Given this, LayoutInfo satisfies the following properties:
+/// 1) LayoutInfo is a lattice with two states - assigned and not assigned.
+/// 2) Two LayoutInfo values are equal if they are both assigned or both not
/// assigned. The concrete value of assigned state does not matter.
/// 3) The meet operator works as follows:
/// - If current state is assigned, return the current state. (already
/// a unique layout is assigned. don't change it)
/// - Otherwise, return the other state.
-struct SGMap {
+struct LayoutInfo {
private:
- WiLayout wiLayout;
- WiData wiData;
+ LaneLayout laneLayout;
+ LaneData laneData;
public:
- SGMap() = default;
- SGMap(const WiLayout &layout, const WiData &data)
- : wiLayout(layout), wiData(data) {}
+ LayoutInfo() = default;
+ LayoutInfo(const LaneLayout &layout, const LaneData &data)
+ : laneLayout(layout), laneData(data) {}
/// Two lattice values are equal if they have `some` layout. The actual
/// content of the layout does not matter.
- bool operator==(const SGMap &other) const {
+ bool operator==(const LayoutInfo &other) const {
return this->isAssigned() == other.isAssigned();
}
- static SGMap meet(const SGMap &lhs, const SGMap &rhs);
+ static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
- static SGMap join(const SGMap &lhs, const SGMap &rhs);
+ static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
void print(raw_ostream &os) const;
- bool isAssigned() const { return wiLayout.size() > 0 && wiData.size() > 0; }
+ bool isAssigned() const {
+ return laneLayout.size() > 0 && laneData.size() > 0;
+ }
- SGMap getTransposedLayout(ArrayRef<int64_t> permutation) const;
+ LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
- const WiLayout &getLayout() const { return wiLayout; }
- const WiData &getData() const { return wiData; }
- ArrayRef<int64_t> getLayoutAsArrayRef() const { return wiLayout.layout; }
- ArrayRef<int64_t> getDataAsArrayRef() const { return wiData.layout; }
+ const LaneLayout &getLayout() const { return laneLayout; }
+ const LaneData &getData() const { return laneData; }
+ ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+ ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
};
-void SGMap::print(raw_ostream &os) const {
+void LayoutInfo::print(raw_ostream &os) const {
if (isAssigned()) {
- os << "wi_layout: ";
- wiLayout.print(os);
- os << ", wi_data: ";
- wiData.print(os);
+ os << "lane_layout: ";
+ laneLayout.print(os);
+ os << ", lane_data: ";
+ laneData.print(os);
} else
os << "Not assigned.";
}
-SGMap SGMap::meet(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
if (!lhs.isAssigned())
return rhs;
return lhs;
}
/// Since this is a backward analysis, join method is not used.
-SGMap SGMap::join(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
llvm_unreachable("Join should not be triggered by SGMapPropagation.");
}
/// Get the transposed layout according to the given permutation.
-SGMap SGMap::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
if (!isAssigned())
return {};
- WiLayout newLayout;
- WiData newData;
+ LaneLayout newLayout;
+ LaneData newData;
for (auto idx : permutation) {
- newLayout.layout.push_back(wiLayout.layout[idx]);
- newData.layout.push_back(wiData.layout[idx]);
+ newLayout.layout.push_back(laneLayout.layout[idx]);
+ newData.layout.push_back(laneData.layout[idx]);
}
- return SGMap(newLayout, newData);
+ return LayoutInfo(newLayout, newData);
}
///===----------------------------------------------------------------------===///
-/// SGMapLattice
+/// LayoutInfoLattice
///===----------------------------------------------------------------------===///
-/// Lattice holding the SGMap for each value.
-struct SGMapLattice : public Lattice<SGMap> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SGMapLattice)
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
using Lattice::Lattice;
};
/// Helper Functions to get default layouts. A `default layout` is a layout that
/// is assigned to a value when the layout is not fixed by some anchor operation
-/// (like DPAS). This is the natural layout work items are arranged in a
-/// subgroup.
+/// (like DPAS).
/// Helper Function to get the default layout for uniform values like constants.
-/// For 1D vector, wi_layout is [subgroupSize] and wi_data is [1].
-/// For 2D vector, wi_layout is [1, subgroupSize] and wi_data is [1, 1].
-static SGMap getDefaultSgMap(unsigned rank) {
+/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
+/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
+static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
if (rank == 1)
- return SGMap(WiLayout({subgroupSize}), WiData({1}));
- return SGMap(WiLayout({1, subgroupSize}), WiData({1, 1}));
+ return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
+ return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
}
/// Helper to get the default layout for a vector type.
-static SGMap getDefaultSgMap(VectorType vectorTy) {
+static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
/// Expecting a 1D or 2D vector.
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
"Expected 1D or 2D vector.");
@@ -216,112 +218,119 @@ static SGMap getDefaultSgMap(VectorType vectorTy) {
"Expected int or float element type.");
/// If the rank is 1, then return default layout for 1D vector.
if (vectorTy.getRank() == 1)
- return getDefaultSgMap(1);
+ return getDefaultLayoutInfo(1);
/// Packing factor is determined by the element type bitwidth.
int packingFactor = 1;
auto bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
if (bitwidth < packedSizeInBitsForDefault)
packingFactor = packedSizeInBitsForDefault / bitwidth;
- return SGMap(WiLayout({1, subgroupSize}), WiData({1, packingFactor}));
+ return LayoutInfo(LaneLayout({1, subgroupSize}),
+ LaneData({1, packingFactor}));
}
-/// Helper Function to get the expected layouts for DPAS operands. `wi_data` is
-/// set according to the following criteria:
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
/// * For A operand, the data must be packed in minimum
/// `packedSizeInBitsForDefault`
/// * For B operand, the data must be packed in minimum
/// `packedSizeInBitsForDpasB`
-static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
+static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
+ unsigned operandNum) {
auto elementTy = vectorTy.getElementType();
assert(elementTy.isIntOrFloat() &&
"Expected int or float type in DPAS operands");
- WiLayout layout({1, subgroupSize});
+ LaneLayout layout({1, subgroupSize});
/// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
/// must have the VNNI format.
if (operandNum == 1 &&
elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
- WiData data(
+ LaneData data(
{packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
- return SGMap(layout, data);
+ return LayoutInfo(layout, data);
}
/// Otherwise, return the default layout for the vector type.
- return getDefaultSgMap(vectorTy);
+ return getDefaultLayoutInfo(vectorTy);
}
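As a worked example of the packing rule above (the values of the packing constants are truncated in this excerpt, so the numbers below are illustrative assumptions only, not taken from the patch):

```c++
// Illustrative arithmetic only; kPackedBitsDpasB is a stand-in for
// packedSizeInBitsForDpasB, whose actual value is not shown in this hunk.
constexpr unsigned kPackedBitsDpasB = 32; // assumed for illustration
constexpr unsigned kF16Bits = 16;
// B operand with f16 elements: 16 < 32, so lane_data = {32 / 16, 1} = {2, 1},
// i.e. each lane owns two vertically adjacent elements (VNNI-style packing).
static_assert(kPackedBitsDpasB / kF16Bits == 2,
              "two f16 values fit in one packed dword");
```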
///===----------------------------------------------------------------------===///
-/// SGMapPropagation
+/// LayoutInfoPropagation
///===----------------------------------------------------------------------===///
-/// Backward data flow analysis to propagate the wi_layout and wi_data of each
-/// value in the program. Currently, the layouts for operands DPAS, StoreNd, and
-/// StoreScatter are fixed (known before propagation). Purpose of this analysis
-/// is to propagate those known layouts to all their producers and (other)
-/// consumers.
-class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for operands DPAS,
+/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
+/// this analysis is to propagate those known layouts to all their producers and
+/// (other) consumers.
+class LayoutInfoPropagation
+ : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
private:
- void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
- void visitStoreNdOp(xegpu::StoreNdOp store, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitStoreNdOp(xegpu::StoreNdOp store,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
- void visitLoadNdOp(xegpu::LoadNdOp load, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitLoadNdOp(xegpu::LoadNdOp load,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitLoadGatherOp(xegpu::LoadGatherOp load,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitTransposeOp(vector::TransposeOp transpose,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitVectorBitcastOp(vector::BitCastOp bitcast,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitCreateDescOp(xegpu::CreateDescOp createDesc,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
public:
- SGMapPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable)
+ LayoutInfoPropagation(DataFlowSolver &solver,
+ SymbolTableCollection &symbolTable)
: SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
- LogicalResult visitOperation(Operation *op, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) override;
+ LogicalResult
+ visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) override;
void visitBranchOperand(OpOperand &operand) override {};
void visitCallOperand(OpOperand &operand) override {};
void visitExternalCall(CallOpInterface call,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) override {};
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) override {
+ };
- void setToExitState(SGMapLattice *lattice) override {
- (void)lattice->meet(SGMap());
+ void setToExitState(LayoutInfoLattice *lattice) override {
+ (void)lattice->meet(LayoutInfo());
}
};
} // namespace
-LogicalResult
-SGMapPropagation::visitOperation(Operation *op,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+LogicalResult LayoutInfoPropagation::visitOperation(
+ Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
TypeSwitch<Operation *>(op)
.Case<xegpu::DpasOp>(
[&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
@@ -355,8 +364,8 @@ SGMapPropagation::visitOperation(Operation *op,
})
/// All other ops.
.Default([&](Operation *op) {
- for (const SGMapLattice *r : results) {
- for (SGMapLattice *operand : operands) {
+ for (const LayoutInfoLattice *r : results) {
+ for (LayoutInfoLattice *operand : operands) {
/// Propagate the layout of the result to the operand.
if (r->getValue().isAssigned())
meet(operand, *r);
@@ -364,15 +373,16 @@ SGMapPropagation::visitOperation(Operation *op,
}
});
/// Add a dependency from each result to program point after the operation.
- for (const SGMapLattice *r : results) {
- addDependency(const_cast<SGMapLattice *>(r), getProgramPointAfter(op));
+ for (const LayoutInfoLattice *r : results) {
+ addDependency(const_cast<LayoutInfoLattice *>(r), getProgramPointAfter(op));
}
return success();
}
-void SGMapPropagation::visitVectorMultiReductionOp(
- vector::MultiDimReductionOp reduction, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+ vector::MultiDimReductionOp reduction,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
/// The layout of the result must be present.
auto resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
@@ -382,7 +392,7 @@ void SGMapPropagation::visitVectorMultiReductionOp(
"Expected 1D layout for reduction result.");
/// Given that the result is 1D, the layout of the operand should be 2D with
/// default layout.
- auto operandLayout = getDefaultSgMap(2);
+ auto operandLayout = getDefaultLayoutInfo(2);
propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
/// Accumulator should have the same layout as the result.
propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -390,9 +400,10 @@ void SGMapPropagation::visitVectorMultiReductionOp(
/// Propagate the layout of the result tensor to the source tensor descriptor in
/// UpdateNdOffsetOp.
-void SGMapPropagation::visitUpdateNdOffsetOp(
- xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+ xegpu::UpdateNdOffsetOp updateNdOffset,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
/// The layout of the result must be present.
auto resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
@@ -402,48 +413,48 @@ void SGMapPropagation::visitUpdateNdOffsetOp(
}
/// Set the layouts for DPAS A, B, and C operands.
-void SGMapPropagation::visitDpasOp(xegpu::DpasOp dpas,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitDpasOp(
+ xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
auto aTy = dpas.getLhsType();
auto bTy = dpas.getRhsType();
propagateIfChanged(operands[0],
- operands[0]->meet(getSGMapForDPASOperand(aTy, 0)));
+ operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
propagateIfChanged(operands[1],
- operands[1]->meet(getSGMapForDPASOperand(bTy, 1)));
+ operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
if (operands.size() > 2) {
auto cTy = dpas.getAccType();
propagateIfChanged(operands[2],
- operands[2]->meet(getSGMapForDPASOperand(cTy, 2)));
+ operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
}
}
/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
-void SGMapPropagation::visitStoreNdOp(xegpu::StoreNdOp store,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto storeLayout = getDefaultSgMap(store.getValueType());
+void LayoutInfoPropagation::visitStoreNdOp(
+ xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ auto storeLayout = getDefaultLayoutInfo(store.getValueType());
/// Both operands should have the same layout
- for (SGMapLattice *operand : operands) {
+ for (LayoutInfoLattice *operand : operands) {
propagateIfChanged(operand, operand->meet(storeLayout));
}
}
/// Propagate the layout of the value to the tensor descriptor operand in
/// LoadNdOp.
-void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitLoadNdOp(
+ xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
auto valueLayout = results[0]->getValue();
/// Need the layout of the value to propagate to the tensor descriptor.
if (!valueLayout.isAssigned())
return;
- SGMap tensorDescLayout = valueLayout;
+ LayoutInfo tensorDescLayout = valueLayout;
/// LoadNdOp has the transpose effect. However, at the stage of this analysis
/// this effect is not expected and should be abstracted away. Emit a warning.
if (auto transpose = load.getTranspose()) {
load.emitWarning("Transpose effect is not expected for LoadNdOp at "
- "SGMapPropagation stage.");
+ "LayoutInfoPropagation stage.");
tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
}
/// Propagate the new layout to the tensor descriptor operand.
@@ -452,9 +463,9 @@ void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
/// For vector::TransposeOp, the layout of the result is transposed and
/// propagated to the operand.
-void SGMapPropagation::visitTransposeOp(
- vector::TransposeOp transpose, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitTransposeOp(
+ vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
/// Need the layout of transpose result to propagate to the operands.
auto resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
@@ -464,11 +475,11 @@ void SGMapPropagation::visitTransposeOp(
propagateIfChanged(operands[0], operands[0]->meet(newLayout));
}
-/// For vector::BitCastOp, the wi_data of the source layout is changed based on
-/// the bit width of the source and result types.
-void SGMapPropagation::visitVectorBitcastOp(
- vector::BitCastOp bitcast, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+ vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
/// Need the layout of bitcast result to propagate to the operands.
auto resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
@@ -478,49 +489,49 @@ void SGMapPropagation::visitVectorBitcastOp(
auto outElemTyBitWidth =
bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
- /// WiLayout does not change.
- const WiLayout &newWiLayout = resultLayout.getLayout();
- const WiData &currData = resultLayout.getData();
- WiData newWiData;
+ /// LaneLayout does not change.
+ const LaneLayout &newLaneLayout = resultLayout.getLayout();
+ const LaneData &currData = resultLayout.getData();
+ LaneData newLaneData;
/// It's a widening bitcast
if (inElemTyBitWidth < outElemTyBitWidth) {
auto ratio = outElemTyBitWidth / inElemTyBitWidth;
- newWiData = resultLayout.getData()[0] == 1
- ? WiData({1, currData[1] * ratio})
- : WiData({currData[0] * ratio, 1});
+ newLaneData = resultLayout.getData()[0] == 1
+ ? LaneData({1, currData[1] * ratio})
+ : LaneData({currData[0] * ratio, 1});
} else {
/// It's a narrowing bitcast
auto ratio = inElemTyBitWidth / outElemTyBitWidth;
- newWiData = resultLayout.getData()[0] == 1
- ? WiData({1, currData[1] / ratio})
- : WiData({currData[0] / ratio, 1});
+ newLaneData = resultLayout.getData()[0] == 1
+ ? LaneData({1, currData[1] / ratio})
+ : LaneData({currData[0] / ratio, 1});
}
propagateIfChanged(operands[0],
- operands[0]->meet(SGMap(newWiLayout, newWiData)));
+ operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
}
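A minimal stand-alone sketch of the bit-width rule implemented above; the helper name and container type are hypothetical, not the pass's API:

```c++
#include <array>
#include <cstdint>

// Mirrors the backward propagation of lane_data through a bitcast: the
// result layout is adjusted by the bit-width ratio to get the source layout.
using LaneData2D = std::array<int64_t, 2>;

static LaneData2D bitcastSourceLaneData(LaneData2D resultData, unsigned srcBits,
                                        unsigned dstBits) {
  if (srcBits < dstBits) {
    // Widening bitcast: each result element is built from several source
    // elements, so the source lane_data grows by the ratio.
    int64_t ratio = dstBits / srcBits;
    return resultData[0] == 1 ? LaneData2D{1, resultData[1] * ratio}
                              : LaneData2D{resultData[0] * ratio, 1};
  }
  // Narrowing bitcast: the source lane_data shrinks by the ratio.
  int64_t ratio = srcBits / dstBits;
  return resultData[0] == 1 ? LaneData2D{1, resultData[1] / ratio}
                            : LaneData2D{resultData[0] / ratio, 1};
}

// Example: i8 -> i16 (widening) with result lane_data {1, 1} gives {1, 2};
// i32 -> i16 (narrowing) with result lane_data {1, 2} gives back {1, 1}.
```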
/// Propagate the layout of the result to the tensor descriptor and mask
/// operands in LoadGatherOp.
-void SGMapPropagation::visitLoadGatherOp(
- xegpu::LoadGatherOp load, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitLoadGatherOp(
+ xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
auto valueLayout = results[0]->getValue();
/// Need the layout of the value to propagate to the tensor descriptor.
if (!valueLayout.isAssigned())
return;
- SGMap tensorDescLayout = valueLayout;
+ LayoutInfo tensorDescLayout = valueLayout;
if (load.getTranspose()) {
/// LoadGatherOp has the transpose effect. However, at the stage of this
/// analyis this effect is not expected and should be abstracted away. Emit
/// a warning.
load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
- "SGMapPropagation stage.");
+ "LayoutInfoPropagation stage.");
tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
}
/// Mask operand should have 1D default layout.
- auto maskLayout = getDefaultSgMap(1);
+ auto maskLayout = getDefaultLayoutInfo(1);
/// Propagate the new layout to the tensor descriptor operand.
propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
/// Propagate the new layout to the mask operand.
@@ -529,23 +540,23 @@ void SGMapPropagation::visitLoadGatherOp(
/// Propagate the layout of the descriptor to the vector offset operand in
/// CreateDescOp.
-void SGMapPropagation::visitCreateDescOp(
- xegpu::CreateDescOp createDesc, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitCreateDescOp(
+ xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
auto descLayout = results[0]->getValue();
/// Need the layout of the descriptor to propagate to the operands.
if (!descLayout.isAssigned())
return;
/// For offset operand propagate 1D default layout.
- SGMap layout = getDefaultSgMap(1);
+ LayoutInfo layout = getDefaultLayoutInfo(1);
propagateIfChanged(operands[1], operands[1]->meet(layout));
}
/// Set the layout for the value, tensor descriptor, and mask operands in the
/// StoreScatterOp.
-void SGMapPropagation::visitStoreScatterOp(
- xegpu::StoreScatterOp storeScatter, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+void LayoutInfoPropagation::visitStoreScatterOp(
+ xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
/// Currently, for 2D StoreScatterOp we expect that the height dimension of
/// the tensor descriptor is evenly divisible by the subgroup size.
/// TODO: Add support for other 2D shapes.
@@ -555,14 +566,14 @@ void SGMapPropagation::visitStoreScatterOp(
"be evenly divisible by the subgroup size.");
return;
}
- auto valueLayout = getDefaultSgMap(storeScatter.getValueType());
- SGMap storeScatterLayout = valueLayout;
+ auto valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
+ LayoutInfo storeScatterLayout = valueLayout;
if (storeScatter.getTranspose()) {
/// StoreScatteOp allows transpose effect. However, at the stage of this
/// analyis this effect is not expected and should be abstracted away. Emit
/// a warning.
storeScatter.emitWarning("Transpose effect is not expected for "
- "StoreScatterOp at SGMapPropagation stage.");
+ "StoreScatterOp at LayoutInfoPropagation stage.");
storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
}
/// Propagate the value layout.
@@ -570,28 +581,28 @@ void SGMapPropagation::visitStoreScatterOp(
/// Propagate the tensor descriptor layout.
propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
/// Use default 1D layout for mask operand.
- auto maskLayout = getDefaultSgMap(1);
+ auto maskLayout = getDefaultLayoutInfo(1);
propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
}
namespace {
///===----------------------------------------------------------------------===///
-/// RunSGMapPropagation
+/// RunLayoutInfoPropagation
///===----------------------------------------------------------------------===///
-/// Driver class for running the SGMapPropagation analysis.
-class RunSGMapPropagation {
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
public:
- RunSGMapPropagation(Operation *op) : target(op) {
+ RunLayoutInfoPropagation(Operation *op) : target(op) {
SymbolTableCollection symbolTable;
solver.load<DeadCodeAnalysis>();
solver.load<SparseConstantPropagation>();
- solver.load<SGMapPropagation>(symbolTable);
+ solver.load<LayoutInfoPropagation>(symbolTable);
(void)solver.initializeAndRun(op);
}
- SGMap getSGMap(Value val);
+ LayoutInfo getLayoutInfo(Value val);
void printAnalysisResult(llvm::raw_ostream &os);
@@ -601,21 +612,21 @@ class RunSGMapPropagation {
};
} // namespace
-SGMap RunSGMapPropagation::getSGMap(Value val) {
- auto *state = solver.lookupState<SGMapLattice>(val);
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+ auto *state = solver.lookupState<LayoutInfoLattice>(val);
if (!state)
return {};
return state->getValue();
}
-void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
auto printFunctionResult = [&](FunctionOpInterface funcOp) {
os << "function: " << funcOp.getName() << ":\n";
// Function arguments
for (auto arg : funcOp.getArguments()) {
- auto layout = getSGMap(arg);
+ auto layout = getLayoutInfo(arg);
os << "argument: " << arg << "\n";
- os << "sg_map : ";
+ os << "layout : ";
layout.print(os);
os << "\n";
}
@@ -631,10 +642,10 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
else
op->print(os);
os << "\n";
- /// Print the sg_map for each result.
+ /// Print the layout for each result.
for (auto [i, r] : llvm::enumerate(op->getResults())) {
- auto layout = getSGMap(r);
- os << "sg_map for result #" << i << ": ";
+ auto layout = getLayoutInfo(r);
+ os << "layout for result #" << i << ": ";
layout.print(os);
os << "\n";
}
@@ -663,8 +674,7 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
- /// If the user is a DpasOp, set "sg_map_a", "sg_map_b", or "sg_map_c"
- /// attribute.
+ /// If the user is a DpasOp, set A, B or C layout attributes.
if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
if (operandNumber == 0)
dpasOp.setALayoutAttr(layout);
@@ -680,11 +690,10 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
}
}
-static LogicalResult
-attachLayoutAttributes(Operation *top,
- llvm::function_ref<SGMap(Value)> getPropagatedLayout) {
- /// Helper to convert SGMap to xegpu::SGMapAttr.
- auto getSGMapForResult = [&](Value r) -> xegpu::LayoutAttr {
+static LogicalResult attachLayoutAttributes(
+ Operation *top, llvm::function_ref<LayoutInfo(Value)> getPropagatedLayout) {
+ /// Helper to convert the layout info to the xegpu::LayoutAttr.
+ auto getLayoutInfoForResult = [&](Value r) -> xegpu::LayoutAttr {
auto layout = getPropagatedLayout(r);
if (!layout.isAssigned())
return {};
@@ -701,9 +710,9 @@ attachLayoutAttributes(Operation *top,
/// For function ops, propagate the argument layout to the users.
if (auto func = dyn_cast<FunctionOpInterface>(op)) {
for (auto arg : func.getArguments()) {
- auto sgMapAttr = getSGMapForResult(arg);
- if (sgMapAttr) {
- attachLayoutAttributeToUsers(arg, sgMapAttr);
+ auto layoutInfo = getLayoutInfoForResult(arg);
+ if (layoutInfo) {
+ attachLayoutAttributeToUsers(arg, layoutInfo);
}
}
return WalkResult::advance();
@@ -713,8 +722,8 @@ attachLayoutAttributes(Operation *top,
return WalkResult::advance();
if (auto tensorDescTy =
dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
- auto sgMapAttr = getSGMapForResult(op->getResult(0));
- if (!sgMapAttr) {
+ auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
+ if (!layoutInfo) {
LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
return WalkResult::interrupt();
}
@@ -725,7 +734,8 @@ attachLayoutAttributes(Operation *top,
auto *newOp = builder.clone(*op);
auto newTensorDescTy = xegpu::TensorDescType::get(
tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(), sgMapAttr);
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+ layoutInfo);
newOp->getResult(0).setType(newTensorDescTy);
op->replaceAllUsesWith(newOp->getResults());
op->erase();
@@ -733,12 +743,12 @@ attachLayoutAttributes(Operation *top,
}
/// Otherwise simply attach the sg_map to the op itself.
for (auto [i, r] : llvm::enumerate(op->getResults())) {
- auto sgMapAttr = getSGMapForResult(r);
- if (sgMapAttr) {
+ auto layoutInfo = getLayoutInfoForResult(r);
+ if (layoutInfo) {
auto attrName = "r" + std::to_string(i);
- op->setAttr(attrName, sgMapAttr);
+ op->setAttr(attrName, layoutInfo);
/// Attach the layout attribute to the users of the result.
- attachLayoutAttributeToUsers(r, sgMapAttr);
+ attachLayoutAttributeToUsers(r, layoutInfo);
}
}
return WalkResult::advance();
@@ -759,18 +769,18 @@ namespace {
///===----------------------------------------------------------------------===///
/// Returns the distributed vector type for a source vector type according to
-/// the wi_layout. We simply divide each dimension of tensor descriptor shape by
-/// corresponding wi_layout dimension. If array_length > 1, that is appended to
-/// the front of the disributed shape.
+/// the lane_layout. We simply divide each dimension of the tensor descriptor
+/// shape by the corresponding lane_layout dimension. If array_length > 1, it is
+/// appended to the front of the distributed shape.
+///
/// Examples:
-/// | original vector shape | wi_layout | distributed vector shape |
-/// |-----------------------|-----------|--------------------------|
-/// | 32x16 | [1, 16] | 32x1 |
-/// | 32x16 | [2, 8] | 16x2 |
-/// | 2x32x16 | [1, 16] | 2x32x1 |
-FailureOr<VectorType>
-getDistributedVecTypeBasedOnWiLayout(xegpu::LayoutAttr layout,
- VectorType originalType) {
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+ VectorType originalType) {
llvm::SmallVector<int64_t, 2> distributedShape;
if (!layout)
return failure();
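The table in the hunk above can be read as a simple shape computation. The following stand-alone sketch (names and container types are illustrative, not the pass's helpers) divides the trailing dimensions by the lane layout and reproduces the three rows of the table:

```c++
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative shape rule only; not the actual pass helper.
static std::vector<int64_t>
distributeShape(std::vector<int64_t> shape,
                const std::vector<int64_t> &laneLayout) {
  assert(shape.size() >= laneLayout.size() && "rank mismatch");
  // The lane layout applies to the innermost dimensions; leading dimensions
  // (e.g. an array_length prepended to the shape) are left untouched.
  size_t offset = shape.size() - laneLayout.size();
  for (size_t i = 0; i < laneLayout.size(); ++i) {
    assert(shape[offset + i] % laneLayout[i] == 0 &&
           "dimension not divisible by the lane layout");
    shape[offset + i] /= laneLayout[i];
  }
  return shape;
}

// distributeShape({32, 16},    {1, 16}) == {32, 1}
// distributeShape({32, 16},    {2, 8})  == {16, 2}
// distributeShape({2, 32, 16}, {1, 16}) == {2, 32, 1}
```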
@@ -1022,7 +1032,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
auto distriburtedTypeByWarpOp =
- getDistributedVecTypeBasedOnWiLayout(layout, storeOp.getValueType());
+ getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
if (failed(distriburtedTypeByWarpOp))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
@@ -1161,11 +1171,11 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
"the xegpu::Dpas op lacks sg_map attribute for A, B or output");
auto distLhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(layoutA, dpasOp.getLhsType());
+ getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
auto distRhsTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(layoutB, dpasOp.getRhsType());
+ getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
auto distResultTypeByWarpOpOrFailure =
- getDistributedVecTypeBasedOnWiLayout(layoutOut, dpasOp.getResultType());
+ getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
if (failed(distLhsTypeByWarpOpOrFailure) ||
failed(distRhsTypeByWarpOpOrFailure) ||
failed(distResultTypeByWarpOpOrFailure))
@@ -1242,14 +1252,16 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
}
void XeGPUSubgroupDistributePass::runOnOperation() {
- auto &analyis = getAnalysis<RunSGMapPropagation>();
+ auto &analyis = getAnalysis<RunLayoutInfoPropagation>();
// Print the analysis result and exit. (for testing purposes)
if (printOnly) {
auto &os = llvm::outs();
analyis.printAnalysisResult(os);
return;
}
- auto getPropagatedLayout = [&](Value val) { return analyis.getSGMap(val); };
+ auto getPropagatedLayout = [&](Value val) {
+ return analyis.getLayoutInfo(val);
+ };
if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
signalPassFailure();
if (failed(resolveLayoutConflicts(getOperation())))
>From 9bddeb6f6b4ba3dc1fef7a666304c520190d02c7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:01:16 +0000
Subject: [PATCH 26/53] drop ScopeAttr and refine 1D layout support
---
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 144 ++++++------
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 +-
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 13 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 93 ++++----
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 21 +-
mlir/test/Dialect/XeGPU/invalid.mlir | 80 +++----
mlir/test/Dialect/XeGPU/ops.mlir | 214 +++++++++---------
7 files changed, 287 insertions(+), 284 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 4afeef1427e8b..80c6ce1160593 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -35,7 +35,7 @@ def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_td
It is default to `Global`.
2. `array_length`: It describes how many horizontally consecutive blocks
will be loaded by a hardware load instruction. If the TensorDesc shape
- is 8x16, with array_length = 2. The loaded block shape will be acctually
+ is 8x16, with array_length = 2. The loaded block shape will be actually
8x32. Its default value is 1.
3. `boundary_check`: It is used to indicates the hardware whether to do
out-of-boundary check. The default value is true.
@@ -154,26 +154,6 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}
-def XeGPU_ScopeWG: I32EnumAttrCase<"WG", 0, "wg">; // workgroup level code
-def XeGPU_ScopeSG: I32EnumAttrCase<"SG", 1, "sg">; // subgroup level code
-def XeGPU_ScopeLane: I32EnumAttrCase<"Lane", 2, "lane">; // simt level code
-
-def XeGPU_ScopeEnums : I32EnumAttr<"Scope", "enumeration of scope",
- [XeGPU_ScopeWG, XeGPU_ScopeSG, XeGPU_ScopeLane]> {
- let genSpecializedAttr = 0;
- let cppNamespace = "::mlir::xegpu";
-}
-
-def XeGPU_ScopeAttr
- : EnumAttr<XeGPU_Dialect, XeGPU_ScopeEnums, "Scope"> {
- let summary = [{Defines the programming scope of the IR,
- where WG represents the workgroup level,
- SG represents the subgroup level, and
- Lane represents the work-item level}];
-
- let assemblyFormat = "``$value";
-}
-
def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let summary = [{
Describes the data distribution to subgroups and work-items for a tensor
@@ -182,75 +162,99 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
let description = [{
XeGPU operations use `LayoutAttr` to define how data is distributed across subgroups and work-items.
This attribute is specified in tensor descriptors during tensor description creation. `LayoutAttr`
- includes the following parameters, categorized into three groups:
-
- ### Group 1:
- * scope: Defines the scope of the code, which can be `wg` (workgroup), `sg` (subgroup),
- or `lane` (work-item). It is mandatory for subgroup-level programming but optional
- for workgroup and work-item levels. By default:
- - If sg_layout is included, the layout is treated as workgroup level.
- - If only `lane_layout` and `lane_data` are included, it is considered work-item level
-
- ### Group 2:
- * sg_layout (optional): Specifies the total number of subgroups and their layout within a workgroup.
- It is mandatory for workgroup-level programming. Its presence implies workgroup-level code, and
- the scope must be empty or set to `wg`.
- * sg_data (optional): Defines the data size accessed per subgroup. It must be used with sg_layout or
- left empty, in which case it can be derived from `lane_layout` and `lane_data` using the formula:
- `sg_data[i] = lane_layout[i] * lane_data[i]`.
- * order (optional): Specifies the dimension order used to linearize n-dimensional sbugroup IDs to
- 1-dimensional IDs. The first dimension in the order list is the fastest-changing dimension.
-
- ### Group 3:
- * lane_layout (required): Specifies the total number of work-items and their layout within a subgroup
- * lane_data: (required): Specifies the data size accessed per work-item for a single distribution.
-
- `lane_data[0] * lane_data[1]` can be greater than 1, indicating that each work item operates on multiple
- elements. These elements are eventually lowered to a "SIMT-flavor" vector, such as a SPIR-V vector or
- an LLVM vector, or packed into a storage data type. The multiple elements specified by lane_data must
- come from a single dimension and be contiguous in memory along either dimension.
+ includes the following parameters:
+
+  * `sg_layout`: Specifies the total number of subgroups and their layout within a workgroup.
+    It is mandatory for workgroup-level programming and optional for subgroup-level programming.
+    Its presence implies workgroup-level code.
+  * `sg_data`: Defines the data size accessed per subgroup. It is optionally used with `sg_layout`
+    for workgroup-level programming. When it is left empty, the size accessed per subgroup can be
+    derived from the tensor shape and `sg_layout` using the formula
+    `sg_data[i] = tensor_shape[i] / sg_layout[i]` (see the worked example after this list).
+  * `inst_data`: Specifies the data size processed by a single instruction. It is optionally
+    used with `lane_layout`. When it is left empty, the data size per instruction is equivalent to
+    `sg_data` for workgroup-level programming, or to the tensor shape for subgroup-level
+    programming.
+  * `lane_layout`: Specifies the total number of work-items and their arrangement within a subgroup.
+    It is mandatory for subgroup-level programming and optional for workgroup-level programming.
+  * `lane_data`: Specifies the shape of the tensor fragment that each lane accesses. It defines a single,
+    minimal distribution unit. Processing the entire tensor may require one or more distribution units per
+    hardware instruction.
+  * `order`: Specifies the dimension order used to linearize the n-dimensional `sg_layout` and `lane_layout`
+    into a 1-dimensional layout. The first dimension in the order list is the fastest-changing dimension.
+    If it is not present, the default value is [1, 0].
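+
+    As a worked illustration of the `sg_data` derivation above (values chosen for demonstration only):
+    distributing a 32x64 tensor with `sg_layout = [2, 4]` and no explicit `sg_data` yields
+    `sg_data = [32/2, 64/4] = [16, 16]`, i.e. each subgroup owns a 16x16 fragment.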
### Examples:
- 1. Work-item level layout:
+ 1. Subgroup level layout:
```mlir
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>
```
- In this example, the subgroup consists of 16 work items arranged as lane_layout=[1, 16], with
- each work item accessing a single element as defined by lane_data=[1, 1].
+    In this example, there are 16 work-items per subgroup, organized as
+    [[0, 1, 2, ..., 7], [8, 9, ..., 15]]. The distribution unit is 1x1.
- 2. Workgroup level layout:
+ 2. Subgroup level layout with order:
```mlir
- #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>
+ #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
+ ```
+    In this example, there are 16 work-items per subgroup, organized as
+ [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]]. The distribution unit is 1x1.
+
+ 3. Workgroup level layout:
+ ```mlir
+ #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1]>
+ ```
+ In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
+ arranged as [[0, 1, 2, 3], [4, 5, 6, 7]]. Each subgroup accesses a 16x16 block per instruction, which
+    is further distributed to 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
+
+ 4. Workgroup level layout with order:
+ ```mlir
+ #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
```
In this example, the layout represents a workgroup distribution. A workgroup consists of 8 subgroups
- arranged in a 2x4 layout. Each subgroup accesses a 16x16 block per instruction, which is further
- distributed to 16 work items as described above.
+ arranged as [[0, 2, 4, 6], [1, 3, 5, 7]]. Each subgroup accesses a 16x16 block per instruction, which
+    is further distributed to 16 work-items organized as [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]].
+
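+    5. Subgroup level layout with inst_data (an illustrative sketch; the shapes below are assumed for
+       demonstration and are not taken from this patch):
+    ```mlir
+    #xegpu.layout<inst_data = [8, 16], lane_layout = [2, 8], lane_data = [1, 1]>
+    ```
+    In this sketch, each instruction would process an 8x16 fragment, which is then distributed to the
+    16 work-items of the subgroup according to lane_layout and lane_data.
+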
}];
let parameters = (ins
- OptionalParameter<"ScopeAttr">: $scope,
OptionalParameter<"DenseI32ArrayAttr">: $sg_layout,
OptionalParameter<"DenseI32ArrayAttr">: $sg_data,
- OptionalParameter<"DenseI32ArrayAttr">: $order,
- "DenseI32ArrayAttr": $lane_layout,
- "DenseI32ArrayAttr": $lane_data
+ OptionalParameter<"DenseI32ArrayAttr">: $inst_data,
+ OptionalParameter<"DenseI32ArrayAttr">: $lane_layout,
+ OptionalParameter<"DenseI32ArrayAttr">: $lane_data,
+ OptionalParameter<"DenseI32ArrayAttr">: $order
);
+ let builders = [
+ AttrBuilder<(ins "llvm::ArrayRef<int>": $lane_layout,
+ "llvm::ArrayRef<int>": $lane_data),
+ [{
+ auto sg_layout = DenseI32ArrayAttr();
+ auto sg_data = DenseI32ArrayAttr();
+ auto inst_data = DenseI32ArrayAttr();
+ auto order = DenseI32ArrayAttr();
+ return $_get($_ctxt, sg_layout, sg_data, inst_data,
+ DenseI32ArrayAttr::get($_ctxt, lane_layout),
+ DenseI32ArrayAttr::get($_ctxt, lane_data), order);
+ }]>
+ ];
+
let extraClassDeclaration = [{
- bool isForWorkgroupLevel() {
- if (!getScope())
- return getSgLayout() != nullptr;
- return getScope() == ScopeAttr::get(getContext(), Scope::WG);
+ bool isWgLayout() {
+ return getSgLayout() != nullptr;
}
- bool isForSubgroupLevel() {
- return getScope() == ScopeAttr::get(getContext(), Scope::SG);
+ bool isSgLayout() {
+ return getSgLayout() == nullptr && getLaneLayout() != nullptr;
}
- bool isForWorkItemLevel() {
- if (!getScope())
- return !getSgLayout() && !getSgData() && !getOrder();
- return getScope() == ScopeAttr::get(getContext(), Scope::Lane);
+ int64_t getRank() {
+ if (auto attr = getSgLayout())
+ return attr.size();
+ if (auto attr = getLaneLayout())
+ return attr.size();
+ return 0;
}
}];
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 41911ee1aa323..16a7f63d60c82 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -841,7 +841,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
can be represented as `B: vector<8x16x2xf16>`.
In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
- which descibe the data fragment owned by each work-item w.r.t. the tensor descriptor
+ which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
these data are loaded from.
Note: on PVC, the hardware can perform load with VNNI transformation when data
@@ -988,8 +988,8 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
let description = [{
`convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
- as workgroup-level (wg) or subgroup-level (sg) code. This operation is not supported for
- work-item-level code.
+ as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
+    lowered to the work-item (WI) level, because that is the end result of all distributions.
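+
+    For illustration only (the shapes and layouts below are assumed, not taken from this patch), a
+    conversion between two workgroup-level layouts might look like:
+    ```mlir
+    %1 = xegpu.convert_layout %0 {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+                                  resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+    ```
+    Here both maps are workgroup-level layouts; only the subgroup tiling of the 32x64 vector changes.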
}];
let arguments = (ins XeGPU_Vector2DType: $source,
XeGPU_LayoutAttr: $srcMap,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 8559f4beb2c03..3d0f52041d798 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -39,7 +39,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
metadata and does not hold the data itself. It is primarily designed to support 2D block load/store
and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information:
- * shape: the sizes/shape of the intereted data block, e.g., 8x16 means 8 rows
+  * shape: the sizes/shape of the data block of interest, e.g., 8x16 means 8 rows
and each row contains 16 contiguous data element. The rows could be
either contiguous or not, depends on the encoding attribute. If the
encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding
@@ -62,7 +62,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
static-dim-list ::= decimal-literal `x` decimal-literal
attr-list = (, encoding-attr)? (, layout-attr)?
    encoding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)?
- layout-attr = (, layout `<` (scope = value,)? (sg_layout = value, sg_data = value, order = value)? lane_layout = value, lane_data = value `>`)?
+    layout-attr = (, layout `<` sg_layout = value, sg_data = value, inst_data = value, lane_layout = value, lane_data = value, order = value `>`)?
```
Examples:
@@ -77,14 +77,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
+ // A TensorDesc with a layout for subgroup level programming
+ xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+
// A TensorDesc with a layout for workgroup level programming
xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
- // A TensorDesc with a layout for subgroup level programming
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // A TensorDesc with a layout for workgroup level programming without lane_layout and lane_data
+ xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16]>>
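+
+  // Illustrative only (not part of this patch; shapes assumed for demonstration):
+  // a TensorDesc whose layout also carries inst_data alongside sg_layout/sg_data.
+  xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>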
- // A TensorDesc with a layout for workitem level programming
- xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = wi, lane_layout = [1, 16], lane_data = [1, 1]>>
```
}];
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 0da86f1af33e4..7aa698de7e2da 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -72,41 +72,58 @@ LogicalResult ScatterTensorDescAttr::verify(
//===----------------------------------------------------------------------===//
LogicalResult
LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
- ScopeAttr scope, DenseI32ArrayAttr sg_layout,
- DenseI32ArrayAttr sg_data, DenseI32ArrayAttr order,
- DenseI32ArrayAttr lane_layout, DenseI32ArrayAttr lane_data) {
+ DenseI32ArrayAttr sg_layout, DenseI32ArrayAttr sg_data,
+ DenseI32ArrayAttr inst_data, DenseI32ArrayAttr lane_layout,
+ DenseI32ArrayAttr lane_data, DenseI32ArrayAttr order) {
+
+  // A valid layout must include at least one of sg_layout or lane_layout.
+  // sg_layout is required for a workgroup-level layout, while lane_layout is
+  // required for a subgroup-level layout.
+ if (!sg_layout && !lane_layout) {
+ return emitError() << "expected at least one of sg_layout or lane_layout";
+ }
+
+ if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
+ return emitError()
+ << "expected sg_layout and lane_layout having the same rank";
+ }
+ // sg_data is optional for Workgroup layout, but its presence requires
+ // sg_layout.
if (sg_data) {
if (!sg_layout)
- return emitError() << "expected sg_layout being used with sg_data.";
+ return emitError() << "expected sg_layout being used with sg_data";
if (sg_data.size() != sg_layout.size())
return emitError() << "expected sg_data having the same rank as sg_layout";
}
- if (order) {
- if (!sg_layout)
- return emitError() << "expected order being used with sg_layout.";
- if (order.size() != sg_layout.size())
- return emitError() << "expected order having the same rank as sg_layout";
- }
-
- if (sg_layout && sg_layout.size() > 2) {
- return emitError() << "expected the rank of the layout to be at most 2";
- }
-
- if (scope && scope.getValue() != Scope::WG &&
- (sg_layout || sg_data || order)) {
- return emitError() << "expected sg_layout, sg_data, or order being only "
- "used at workgroup level.";
+ // inst_data is optional for Subgroup layout, but its presence requires
+ // lane_layout.
+ if (inst_data) {
+ if (!lane_layout)
+ return emitError() << "expected lane_layout being used with inst_data";
+ if (inst_data.size() != lane_layout.size())
+ return emitError()
+ << "expected inst_data having the same rank as lane_layout";
}
- if (scope && scope.getValue() == Scope::WG && !sg_layout ) {
- return emitError() << "expected sg_layout for workgroup level layout";
+ // lane_data is optional for Subgroup layout, but its presence requires
+ // lane_layout.
+ if (lane_data) {
+ if (!lane_layout)
+ return emitError() << "expected lane_layout being used with lane_data";
+ if (lane_data.size() != lane_layout.size())
+ return emitError()
+ << "expected lane_data having the same rank as lane_layout";
}
- if (lane_layout.size() != lane_data.size() || lane_layout.size() > 2) {
- return emitError() << "expected lane_layout and lane_data having the same "
- "rank, with a maximum rank of 2";
+ if (order) {
+ if (!sg_layout && !lane_layout)
+ return emitError()
+ << "expected sg_layout/lane_layout being used with order";
+ if (order.size() != sg_layout.size() && order.size() != lane_layout.size())
+ return emitError()
+ << "expected order having the same rank as sg_layout/lane_layout";
}
return success();
@@ -249,26 +266,24 @@ LogicalResult TensorDescType::verify(
}
if (auto layoutAttr = llvm::dyn_cast_if_present<LayoutAttr>(layout)) {
+
+ if (rank != (size_t)layoutAttr.getRank())
+ return emitError() << "expected layout rank to match tensor rank";
+
ArrayRef<int32_t> laneLayout = layoutAttr.getLaneLayout().asArrayRef();
ArrayRef<int32_t> laneData = layoutAttr.getLaneData().asArrayRef();
- if (rank == 1) {
- if (laneLayout[0] != 1 || laneData[0] != 1)
- return emitError()
- << "outer layout distribution and data mapping must be 1 "
- "for 1D tensor";
- }
-
if (scatterAttr) {
// Validate subgroup mapping rules for scattered tensors.
// A work-item's slice of the tensor with shape [sg_size] or
// [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
// respectively, the mapping should reflect that. This is because each
// work item access data in 32 bit granularity.
- if (laneData[0] != 1)
+
+ if (rank > 1 && laneData[0] != 1)
return emitError()
<< "cannot map over non-contiguous scattered row elements";
- if (laneData[1] != packingFactor)
+ if (laneData.back() != packingFactor)
return emitError() << "work item data mapping must match the number of "
"contiguous elements";
}
@@ -276,8 +291,6 @@ LogicalResult TensorDescType::verify(
// For 1D tensor, pad the shape with an outer unit dimension to allow common
// validation logic.
SmallVector<int64_t> tensorShape(shape.begin(), shape.end());
- if (rank == 1)
- tensorShape = {1, tensorShape.back()};
size_t dims = tensorShape.size();
for (size_t i = 0; i < dims; ++i) {
@@ -319,7 +332,7 @@ LogicalResult TensorDescType::verify(
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
// If no layout is provided, tensor desc is not used in SIMT mode.
- if (!layout || !layout.isForWorkItemLevel())
+ if (!layout)
return failure();
SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
@@ -347,14 +360,6 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
}
// Case 2: block loads/stores
- // Tensor descriptor shape can be 1D. For the 1D case, outer dims of laneData
- // and laneLayout must be 1.
- if (tdescShape.size() == 1) {
- assert((laneData[0] == 1 && laneLayout[0] == 1) &&
- "lane_data[0] and lane_layout[0] must be 1 for 1D tensor descriptor");
- laneData = {laneData[1]};
- laneLayout = {laneLayout[1]};
- }
// Check if the tensor descriptor shape is distributable.
int64_t tensorSize = 1;
for (auto [tdescDim, wiDim, laneDataDim] :
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index e2ccc59d39371..2ac3426904fa8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -82,7 +82,7 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
auto valueShape = valueTy.getShape();
// layout not present means IR is in SIMD mode. In this case value shape must
// match adjusted tensor descriptor shape.
- if (!layout || !layout.isForWorkItemLevel())
+ if (!layout)
return valueShape == adjustedTdescShape
? success()
: emitError()
@@ -606,13 +606,6 @@ LogicalResult DpasOp::verify() {
result |= (aLayout != nullptr) ^ (cLayout != nullptr);
}
result = !result;
-
- if (aLayout) {
- auto scope = aLayout.getScope();
- result &= bLayout ? scope == bLayout.getScope() : false;
- if (hasAcc())
- result &= cLayout ? scope == cLayout.getScope() : false;
- }
return result;
};
@@ -622,7 +615,7 @@ LogicalResult DpasOp::verify() {
"code) or not set at all (for SIMD code).");
// query the scope from aLayout (a valid setting).
- if (aLayout && aLayout.isForWorkItemLevel()) {
+ if (aLayout) {
// In SIMT mode, All data fragments must be 2D
if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
@@ -673,14 +666,14 @@ LogicalResult ConvertLayoutOp::verify() {
if (!resMap)
return emitOpError("expected resMap.");
- if (srcMap.getScope() != resMap.getScope())
- return emitOpError("expected srcMap and resMap be in the same scope.");
-
if (srcMap == resMap)
return emitOpError("expected different srcMap and resMap.");
- if (srcMap.isForWorkItemLevel())
- return emitOpError("doesn't work on SIMT code.");
+  // srcMap and resMap must both be WgLayout, or both be SgLayout.
+ if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
+ (!srcMap.isSgLayout() || !resMap.isSgLayout()))
+ return emitOpError(
+ "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
auto shape = getSource().getType().getShape();
if (!isEvenDistributed(shape, srcMap))
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 17e4f60638905..8b5e42af2f7b8 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,11 +80,11 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-> vector<8x2xf32>
return
}
@@ -92,11 +92,11 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
// expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-> vector<8xf32>
return
}
@@ -136,20 +136,20 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
// -----
func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
return
}
// -----
func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
// expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
return
}
@@ -247,8 +247,8 @@ func.func @test_prefetch_vc_2(%src: ui64) {
// -----
func.func @test_create_tdesc_layout_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ // expected-error at +1 {{expected layout rank to match tensor rank}}
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
return
}
@@ -256,7 +256,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
func.func @test_create_tdesc_layout_2(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [2, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [1, 4], lane_data = [2, 1]>>
return
}
@@ -264,7 +264,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
func.func @test_create_tdesc_layout_3(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
return
}
@@ -272,9 +272,9 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
func.func @test_load_gather_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
return
}
@@ -282,9 +282,9 @@ func.func @test_load_gather_layout_1(%src: ui64) {
func.func @test_load_gather_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
return
}
@@ -294,9 +294,9 @@ func.func @test_store_scatter_layout_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
return
}
@@ -305,9 +305,9 @@ func.func @test_store_scatter_layout_2(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
// expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
return
}
@@ -396,16 +396,16 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
// -----
func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
// expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
return
}
// -----
func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
// expected-error at +1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
- b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
- c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
return
}
@@ -438,16 +438,16 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) {
// -----
func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [2, 16], lane_data = [1, 1]>>
+ // expected-error at +1 {{expected layout rank to match tensor rank}}
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [2, 16], lane_data = [1, 1]>>
return
}
// -----
func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- // expected-error at +1 {{outer layout distribution and data mapping must be 1 for 1D tensor}}
- !xegpu.tensor_desc<16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ // expected-error at +1 {{expected layout rank to match tensor rank}}
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
return
}
@@ -455,7 +455,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 8 over 16 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
return
}
@@ -463,7 +463,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 1]>>
return
}
@@ -471,7 +471,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 2 work items with 4 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [2, 8], lane_data = [4, 1]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [4, 1]>>
return
}
@@ -479,7 +479,7 @@ func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) {
func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) {
%0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
// expected-error at +1 {{cannot distribute 4 over 8 work items with 1 elements each}}
- !xegpu.tensor_desc<4x8xf32, #xegpu.layout<scope = lane, lane_layout = [8, 2], lane_data = [1, 2]>>
+ !xegpu.tensor_desc<4x8xf32, #xegpu.layout<lane_layout = [8, 2], lane_data = [1, 2]>>
return
}
@@ -490,7 +490,7 @@ func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) {
// expected-error at +1 {{cannot map over non-contiguous scattered row elements}}
!xegpu.tensor_desc<4x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<scope = lane, lane_layout = [1, 1], lane_data = [2, 1]>>
+ #xegpu.layout<lane_layout = [1, 1], lane_data = [2, 1]>>
return
}
@@ -500,7 +500,7 @@ func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<1
// expected-error at +1 {{work item data mapping must match the number of contiguous elements}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
+ #xegpu.layout<lane_layout = [8], lane_data = [2]>>
return
}
@@ -510,7 +510,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected non-contiguous elements for 1D tensor}}
!xegpu.tensor_desc<16xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 2>,
- #xegpu.layout<scope = lane, lane_layout = [1, 8], lane_data = [1, 2]>>
+ #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 2]>>
return
}
@@ -520,22 +520,22 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
// expected-error at +1 {{expected chunk blocks for 2D tensor}}
!xegpu.tensor_desc<16x2xf32,
#xegpu.scatter_tdesc_attr<chunk_size = 1>,
- #xegpu.layout<scope = lane, lane_layout = [8, 1], lane_data = [1, 2]>>
+ #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2]>>
return
}
// -----
func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
// expected-error at +1 {{expected different srcMap and resMap}}
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
// -----
func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
- // expected-error at +1 {{expected srcMap and resMap be in the same scope}}
+ // expected-error at +1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
- resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index e52562a2f453d..54f14c6cb8c65 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -15,9 +15,9 @@ gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -34,8 +34,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
//CHECK: %[[C:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -48,8 +48,8 @@ gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -62,8 +62,8 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -76,8 +76,8 @@ gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
gpu.return
}
@@ -90,8 +90,8 @@ gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -106,10 +106,10 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
gpu.return
}
@@ -162,11 +162,11 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+ !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -181,11 +181,11 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
gpu.return
}
@@ -200,11 +200,11 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x1xf32>
+ !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
gpu.return
}
@@ -219,11 +219,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
gpu.return
}
@@ -238,11 +238,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
gpu.return
}
@@ -257,10 +257,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<scope = lane, lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
gpu.return
}
@@ -279,11 +279,11 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
%1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -305,11 +305,11 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
// CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
%1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
%2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
gpu.return
}
@@ -324,10 +324,10 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
- %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
}
@@ -344,8 +344,8 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
gpu.func @test_create_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -363,8 +363,8 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -383,7 +383,7 @@ gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
- %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
gpu.return
}
@@ -401,8 +401,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
gpu.func @test_create_tdesc_simt_3(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
gpu.return
}
@@ -425,10 +425,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
gpu.return
}
@@ -451,10 +451,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +477,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
gpu.return
}
@@ -509,10 +509,10 @@ gpu.func @test_store_simt(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
%2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
gpu.return
}
@@ -541,10 +541,10 @@ gpu.func @test_store_simt_2(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
%2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
gpu.return
}
@@ -572,10 +572,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<scope = lane, lane_layout = [1, 4], lane_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
gpu.return
}
@@ -583,10 +583,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
gpu.func @test_prefetch_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
- xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
+ xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
gpu.return
}
@@ -605,13 +605,13 @@ gpu.func @test_prefetch_vc(%src: ui64) {
// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
gpu.func @test_create_update_tdesc_simt(%src: ui64) {
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
//CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
- //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
+ //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>
+ %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
%s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
- %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<scope = lane, lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
+ %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
gpu.return
}
@@ -637,12 +637,12 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
- // CHECK: b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
- // CHECK: c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>,
- b_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [2, 1]>,
- c_layout = #xegpu.layout<scope = lane, lane_layout = [1, 16], lane_data = [1, 1]>}
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ // CHECK: b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ // CHECK: c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+ %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
: vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
gpu.return
}
@@ -712,8 +712,8 @@ gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}
gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
- %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [2, 1]>,
- resMap = #xegpu.layout<scope = sg, lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+ %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}
>From 784ab38e3a0dc4fd6288375eccba66c9b8db58b4 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:09:09 +0000
Subject: [PATCH 27/53] refine isEvenDistributed
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2ac3426904fa8..35a5421410305 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -107,14 +107,17 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr attr) {
- assert(attr && "workgroup map attribute is missing.");
+ assert(attr && "Layout attribute is missing.");
+ llvm::SmallVector<int32_t> defaults(shape.size(), 1);
llvm::ArrayRef<int32_t> layout, data;
- if (attr.getSgLayout()) {
- data = attr.getSgData().asArrayRef();
- layout = attr.getSgLayout().asArrayRef();
+ if (auto sg_layout = attr.getSgLayout()) {
+ layout = sg_layout.asArrayRef();
+ auto sg_data = attr.getSgData();
+ data = sg_data? sg_data.asArrayRef(): defaults;
} else {
- data = attr.getLaneData().asArrayRef();
layout = attr.getLaneLayout().asArrayRef();
+ auto lane_data = attr.getLaneData();
+ data = lane_data? lane_data.asArrayRef(): defaults;
}
for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
// check s % (d * l) != 0
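
For readers skimming the hunk, the refined predicate can be read in isolation as the following minimal C++ sketch (free-standing, with illustrative names; the real code operates on the LayoutAttr accessors shown above, which are not reproduced here):

```cpp
#include <cstdint>
#include <vector>

// A shape distributes evenly iff every dimension is a multiple of
// data[i] * layout[i]; the two-step modulo mirrors the comment in the patch
// and is equivalent to shape[i] % (data[i] * layout[i]) == 0.
static bool isEvenlyDistributable(const std::vector<int64_t> &shape,
                                  const std::vector<int32_t> &data,
                                  const std::vector<int32_t> &layout) {
  if (shape.size() != data.size() || shape.size() != layout.size())
    return false;
  for (size_t i = 0; i < shape.size(); ++i)
    if (shape[i] % data[i] != 0 || (shape[i] / data[i]) % layout[i] != 0)
      return false;
  return true;
}
```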
>From 28cf69ef3bd26cfd08894deeddc79799aa8f2dcf Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:49:36 +0000
Subject: [PATCH 28/53] format code
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 35a5421410305..a60288f2eb77d 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -113,11 +113,11 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
if (auto sg_layout = attr.getSgLayout()) {
layout = sg_layout.asArrayRef();
auto sg_data = attr.getSgData();
- data = sg_data? sg_data.asArrayRef(): defaults;
+ data = sg_data ? sg_data.asArrayRef() : defaults;
} else {
layout = attr.getLaneLayout().asArrayRef();
auto lane_data = attr.getLaneData();
- data = lane_data? lane_data.asArrayRef(): defaults;
+ data = lane_data ? lane_data.asArrayRef() : defaults;
}
for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
// check s % (d * l) != 0
>From 9ed0f874e8f3fdf897bbcc96f8d02c6a38507db6 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 19:58:42 +0000
Subject: [PATCH 29/53] fix format issue
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7aa698de7e2da..c95f9e90f589e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -94,7 +94,8 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
if (!sg_layout)
return emitError() << "expected sg_layout being used with sg_data";
if (sg_data.size() != sg_layout.size())
- return emitError() << "expected sg_data having the same rank as sg_layout";
+ return emitError()
+ << "expected sg_data having the same rank as sg_layout";
}
// inst_data is optional for Subgroup layout, but its presence requires
@@ -297,8 +298,8 @@ LogicalResult TensorDescType::verify(
uint32_t numElemPerWi = laneLayout[i] * laneData[i];
if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
return emitError() << "cannot distribute " << tensorShape[i] << " over "
- << laneLayout[i] << " work items with " << laneData[i]
- << " elements each";
+ << laneLayout[i] << " work items with "
+ << laneData[i] << " elements each";
}
}
>From 3b389bfcaa2bde33ab2651eadcc7bb4a6a4b9c78 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 1 Apr 2025 20:27:18 +0000
Subject: [PATCH 30/53] add 1D layout examples
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 3d0f52041d798..173f1462fdd73 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -77,6 +77,12 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
// A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_space = slm>>
+ // A 1D TensorDesc with a layout for subgroup level programming; each lane accesses two contiguous elements
+ xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [2]>>
+
+ // A 1D TensorDesc with a layout for subgroup level programming; each lane accesses two elements with stride = 16
+ xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+
// A TensorDesc with a layout for subgroup level programming
xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
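
As a rough worked illustration of the two new 1D examples (hypothetical helper, not dialect code): with 32 f32 elements and lane_layout = [16], each lane owns 32 / 16 = 2 elements, i.e. a vector<2xf32>; lane_data = [2] makes those two elements contiguous, while lane_data = [1] hands elements out one at a time round-robin, so the two elements a lane owns are 16 apart.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper: the per-lane (distributed) shape is
// shape[i] / lane_layout[i]; lane_data only decides whether a lane's
// elements are contiguous or strided across the tensor.
static std::vector<int64_t>
perLaneShape(const std::vector<int64_t> &shape,
             const std::vector<int64_t> &laneLayout) {
  assert(shape.size() == laneLayout.size());
  std::vector<int64_t> perLane;
  for (size_t i = 0; i < shape.size(); ++i) {
    assert(shape[i] % laneLayout[i] == 0 && "not evenly distributable");
    perLane.push_back(shape[i] / laneLayout[i]);
  }
  return perLane; // e.g. {32} with laneLayout {16} -> {2}, i.e. vector<2xf32>
}
```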
>From 589d2171c173553cd7131425e85d920096fde19b Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 2 Apr 2025 17:10:35 +0000
Subject: [PATCH 31/53] refactor names
---
.../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8e1e846c94d3e..5b812a731ec95 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -697,13 +697,13 @@ static LogicalResult attachLayoutAttributes(
auto layout = getPropagatedLayout(r);
if (!layout.isAssigned())
return {};
- SmallVector<int, 2> wiLayout, wiData;
+ SmallVector<int, 2> laneLayout, laneData;
for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
layout.getDataAsArrayRef())) {
- wiLayout.push_back(static_cast<int>(layout));
- wiData.push_back(static_cast<int>(data));
+ laneLayout.push_back(static_cast<int>(layout));
+ laneData.push_back(static_cast<int>(data));
}
- return xegpu::LayoutAttr::get(r.getContext(), wiLayout, wiData);
+ return xegpu::LayoutAttr::get(r.getContext(), laneLayout, laneData);
};
/// Attach the layout attributes to the results of the operations.
auto walkResult = top->walk([&](Operation *op) {
@@ -785,22 +785,22 @@ FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
if (!layout)
return failure();
- auto wiLayout = layout.getLaneLayout();
+ auto laneLayout = layout.getLaneLayout();
assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
"expecting 2D or 3D shape for the original vector type");
- assert(wiLayout.size() == 2 && "expecting 2D shape for the wi layout");
+ assert(laneLayout.size() == 2 && "expecting 2D shape for the wi layout");
// Original type can be 2D or 3D (array_length > 1), the last two dims are the
// block shape.
auto blockShape = originalType.getShape().take_back(2);
// Check if the block vector shape can be distributed evenly.
- if (blockShape[0] % wiLayout[0] != 0 || blockShape[1] % wiLayout[1] != 0)
+ if (blockShape[0] % laneLayout[0] != 0 || blockShape[1] % laneLayout[1] != 0)
return failure();
if (originalType.getRank() == 3) {
distributedShape.push_back(originalType.getShape()[0]);
}
for (unsigned i = 0; i < 2; ++i) {
- distributedShape.push_back(blockShape[i] / wiLayout[i]);
+ distributedShape.push_back(blockShape[i] / laneLayout[i]);
}
auto newVectorType =
VectorType::get(distributedShape, originalType.getElementType());
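
To make the shape computation in this helper concrete, here is a minimal, self-contained C++ sketch of the same idea (plain integer vectors instead of the MLIR types; an illustration rather than the pass code itself): keep a leading array_length dimension when present and divide the trailing 2D block shape element-wise by the 2D lane layout.

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Illustrative only: mirrors the distribution rule in the helper above.
// A 3-D input keeps its leading (array_length) dimension per lane; the
// trailing 2-D block shape is divided element-wise by the lane layout.
static std::optional<std::vector<int64_t>>
distributeByLaneLayout(const std::vector<int64_t> &shape,
                       const std::vector<int64_t> &laneLayout) {
  if ((shape.size() != 2 && shape.size() != 3) || laneLayout.size() != 2)
    return std::nullopt;
  std::vector<int64_t> block(shape.end() - 2, shape.end());
  if (block[0] % laneLayout[0] != 0 || block[1] % laneLayout[1] != 0)
    return std::nullopt; // block shape is not evenly distributable
  std::vector<int64_t> result;
  if (shape.size() == 3)
    result.push_back(shape[0]); // array_length stays the same per lane
  result.push_back(block[0] / laneLayout[0]);
  result.push_back(block[1] / laneLayout[1]);
  return result; // e.g. {8, 16} with lane layout {1, 16} -> {8, 1}
}
```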
>From c6ccef2e84c1c5ffcfd8b6d98e14f73b559779d3 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 2 Apr 2025 21:47:40 +0000
Subject: [PATCH 32/53] refactor
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 7 -------
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 9 ---------
2 files changed, 16 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 57c9bbc7b29f3..80c6ce1160593 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -258,13 +258,6 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
}
}];
- let builders = [
- AttrBuilder<(ins
- "ArrayRef<int>": $lane_layout,
- "ArrayRef<int>": $lane_data
- )>
- ];
-
let assemblyFormat = "`<` struct(params) `>`";
let genVerifyDecl = 1;
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ccd0a48c8391e..c2508785d63b0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -133,15 +133,6 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return success();
}
-LayoutAttr LayoutAttr::get(mlir::MLIRContext *context, ArrayRef<int> laneLayout,
- ArrayRef<int> laneData) {
- return Base::get(context, ScopeAttr::get(context, Scope::Lane),
- DenseI32ArrayAttr(), DenseI32ArrayAttr(),
- DenseI32ArrayAttr(),
- DenseI32ArrayAttr::get(context, laneLayout),
- DenseI32ArrayAttr::get(context, laneData));
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_TensorDescType
//===----------------------------------------------------------------------===//
>From cbd0af0c5bf32fcb395aafd22b83a5c0c9e8e804 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:05:05 +0000
Subject: [PATCH 33/53] refine LayoutAttr verifier
---
.../SPIRV/IR/SPIRVIntelExtEmbargoOps.td | 85 +++++++++++++
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 25 +++-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 51 ++++----
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 15 ++-
mlir/test/Dialect/XeGPU/invalid.mlir | 112 +++++++++++++++++-
5 files changed, 254 insertions(+), 34 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
new file mode 100644
index 0000000000000..e3e16a0966ada
--- /dev/null
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
@@ -0,0 +1,85 @@
+//===- SPIRVIntelExtEmbargoOps.td - Intel SPIR-V extensions ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the op definition spec of Intel-specific SPIR-V extensions
+// These extensions are not part of Khronos specification and available under
+// Embargo.
+// Supported extensions
+// * SPV_INTEL_region_group
+//===----------------------------------------------------------------------===//
+
+
+#ifndef MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
+#define MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
+
+// -----
+
+def SPIRV_INTELSubRegionControlBarrierOp : SPIRV_IntelVendorOp<"SubRegionControlBarrier", []> {
+ let summary = "See extension SPV_INTEL_region_group";
+
+ let description = [{
+ Wait for all active invocations within the current sub-region to reach
+ the current point of execution.
+
+ All active invocations within the current sub-region reach this point of
+ execution before any invocation proceeds beyond it.
+
+ A sub-region is a subset of the workgroups in a region group. The region
+ group is partitioned into groups of SubRegionSize workgroups, and
+ the workgroups are ordered by their linearized ID. The first SubRegionSize
+ workgroups in this sequence are the first sub-region, the next
+ SubRegionSize workgroups are the next sub-region, etc. The total number of
+ workgroups in the region-group must be evenly divisible by SubRegionSize,
+ otherwise the behavior is undefined.
+
+ Behavior is undefined unless all invocations within the current sub-region
+ execute the same dynamic instance of this instruction. SubRegionSize value
+ must be the same for all invocations within the current sub-region,
+ or otherwise behavior is undefined.
+
+ If Semantics is not None, this instruction also serves as an
+ OpMemoryBarrier instruction, and also performs and adheres to the
+ description and semantics of an OpMemoryBarrier instruction with the
+ same Memory and Semantics operands. This allows atomically specifying
+ both a control barrier and a memory barrier (that is, without needing
+ two instructions). If Semantics is None, Memory is ignored.
+
+ #### Example:
+
+ ```mlir
+ spirv.SubRegionControlBarrier %0, "RegionINTEL", "None"
+ ```
+
+ }];
+
+
+ let availability = [
+ MinVersion<SPIRV_V_1_0>,
+ MaxVersion<SPIRV_V_1_6>,
+ Extension<[SPV_INTEL_region_group]>,
+ Capability<[SPIRV_C_RegionGroupINTEL]>
+ ];
+
+ let arguments = (ins
+ SPIRV_Int32:$subregion_size,
+ SPIRV_ScopeAttr:$memory_scope,
+ SPIRV_MemorySemanticsAttr:$memory_semantics
+ );
+
+ let assemblyFormat = [{
+ $subregion_size `,` $memory_scope `,` $memory_semantics attr-dict
+ }];
+
+ let results = (outs);
+
+ let hasVerifier = 0;
+}
+
+// -----
+
+#endif // MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 80c6ce1160593..15aa053017b49 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -165,8 +165,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
includes the following parameters:
* `sg_layout`: Specifies the total number of subgroups and their layout within a workgroup.
- It is mandatory for workgroup-level programming and optional for subgroup programming. Its
- presence implies workgroup-level code.
+ It is mandatory for workgroup-level programming. Its presence implies workgroup-level code.
* `sg_data`: Defines the data size accessed per subgroup. It is optionally used with `sg_layout`
for workgroup-level programming. When it is left empty, the size accessed per subgroup can be
derived from the tensor shape and `sg_layout` using the formula:
@@ -199,7 +198,15 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
In this example, there are 16 work-items per subgroup, and is organized as
[[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]]. The distribution unit is 1x1.
- 3. Workgroup level layout:
+ 3. Subgroup level layout with inst_data
+ ```mlir
+ #xegpu.layout<inst_data = [8, 16], lane_layout = [2, 8], lane_data = [2, 2]>
+ ```
+ In this example, the original problem size is divided into smaller subproblems of size [8, 16],
+ which are further distributed across 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
+ Each work-item is assigned a contiguous 2x2 block.
+
+ 4. Workgroup level layout:
```mlir
#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1]>
```
@@ -207,7 +214,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
arranged as [[0, 1, 2, 3], [4, 5, 6, 7]]. Each subgroup accesses a 16x16 block per instruction, which
is further distributed to 16 work items which is organized as [[0, 1, 2, .., 7],[8, 9, .., 15]].
- 4. Workgroup level layout with order:
+ 5. Workgroup level layout with order:
```mlir
#xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [2, 8], lane_data = [1, 1], order = [0, 1]>
```
@@ -215,6 +222,14 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
arranged as [[0, 2, 4, 6], [1, 3, 5, 7]]. Each subgroup accesses a 16x16 block per instruction, which
is further distributed to 16 work items which is organized as [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]].
+ 6. Workgroup level layout with inst_data:
+ ```mlir
+ #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], inst_data = [8, 16], lane_layout = [2, 8], lane_data = [1, 1]>
+ ```
+ This example is similar to the previous ones, but the `inst_data` parameter divides `sg_data` into two instructions,
+ each processing an 8x16 block. These blocks are further distributed across 16 work-items with a distribution unit of 1x1.
+ Unlike the 2x2 distribution unit in example 3, which results in accessing contiguous 2x2 blocks, the 1x1 distribution
+ unit may result in non-contiguous access.
}];
let parameters = (ins
@@ -252,6 +267,8 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
int64_t getRank() {
if (auto attr = getSgLayout())
return attr.size();
+ if (auto attr = getInstData())
+ return attr.size();
if (auto attr = getLaneLayout())
return attr.size();
return 0;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index c95f9e90f589e..8206db0198eaf 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -79,13 +79,27 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
// A valid layout must include at least one of sg_layout and lane_layout.
// sg_layout is essential for Workgroup layout, while lane_layout is
// required for Subgroup layout.
- if (!sg_layout && !lane_layout) {
- return emitError() << "expected at least one of sg_layout or lane_layout";
+ if (!sg_layout && !inst_data && !lane_layout) {
+ return emitError()
+ << "expected at least one of sg_layout, inst_data or lane_layout";
+ }
+
+ // Check that sg_layout, inst_data and lane_layout have the same rank
+ // when they are present.
+
+ if (sg_layout && inst_data && sg_layout.size() != inst_data.size()) {
+ return emitError()
+ << "expected sg_layout and inst_data to have the same rank";
}
if (sg_layout && lane_layout && sg_layout.size() != lane_layout.size()) {
return emitError()
- << "expected sg_layout and lane_layout having the same rank";
+ << "expected sg_layout and lane_layout to have the same rank";
+ }
+
+ if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
+ return emitError()
+ << "expected inst_data and lane_layout to have the same rank";
}
// sg_data is optional for Workgroup layout, but its presence requires
@@ -95,17 +109,7 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return emitError() << "expected sg_layout being used with sg_data";
if (sg_data.size() != sg_layout.size())
return emitError()
- << "expected sg_data having the same rank as sg_layout";
- }
-
- // inst_data is optional for Subgroup layout, but its presence requires
- // lane_layout.
- if (inst_data) {
- if (!lane_layout)
- return emitError() << "expected lane_layout being used with inst_data";
- if (inst_data.size() != lane_layout.size())
- return emitError()
- << "expected inst_data having the same rank as lane_layout";
+ << "expected sg_data and sg_layout to have the same rank";
}
// lane_data is optional for Subgroup layout, but its presence requires
@@ -115,16 +119,21 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
return emitError() << "expected lane_layout being used with lane_data";
if (lane_data.size() != lane_layout.size())
return emitError()
- << "expected lane_data having the same rank as lane_layout";
+ << "expected lane_data and lane_layout to have the same rank";
}
if (order) {
if (!sg_layout && !lane_layout)
return emitError()
<< "expected sg_layout/lane_layout being used with order";
- if (order.size() != sg_layout.size() && order.size() != lane_layout.size())
+
+ if (sg_layout && order.size() != sg_layout.size())
+ return emitError()
+ << "expected order and sg_layout to have the same rank";
+
+ if (lane_layout && order.size() != lane_layout.size())
return emitError()
- << "expected order having the same rank as sg_layout/lane_layout";
+ << "expected order and lane_layout to have the same rank";
}
return success();
@@ -341,9 +350,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto tdescShape = getShape();
auto laneDataSize = 1, sgSize = 1;
- for (auto [wiDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
+ for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
laneDataSize *= laneDataDim;
- sgSize *= wiDim;
+ sgSize *= laneDim;
}
// Case 1: regular loads/stores
@@ -363,9 +372,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
// Case 2: block loads/stores
// Check if the tensor descriptor shape is distributable.
int64_t tensorSize = 1;
- for (auto [tdescDim, wiDim, laneDataDim] :
+ for (auto [tdescDim, laneDim, laneDataDim] :
llvm::zip_equal(tdescShape, laneLayout, laneData)) {
- assert((tdescDim % (wiDim * laneDataDim) == 0) &&
+ assert((tdescDim % (laneDim * laneDataDim) == 0) &&
"tensor descriptor shape is not distributable");
tensorSize *= tdescDim;
}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index a60288f2eb77d..12b45a223183a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -119,9 +119,10 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
auto lane_data = attr.getLaneData();
data = lane_data ? lane_data.asArrayRef() : defaults;
}
- for (auto [s, d, l] : llvm::zip_equal(shape, data, layout)) {
- // check s % (d * l) != 0
- if (s % d != 0 || (s / d) % l != 0)
+ for (auto [dimSize, dataFactor, layoutFactor] :
+ llvm::zip_equal(shape, data, layout)) {
+ // check dimSize % (dataFactor * layoutFactor) != 0
+ if (dimSize % dataFactor != 0 || (dimSize / dataFactor) % layoutFactor != 0)
return false;
}
return true;
@@ -602,17 +603,15 @@ LogicalResult DpasOp::verify() {
// make sure the layout attribute is either set for every available
// operand or simply not set at all. C is special, since ACC is optional.
- // If they are all set, they also should be in the same scope.
- auto isValidSet = [&]() {
+ auto hasValidLayoutAttrs = [&]() {
bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
if (hasAcc()) {
result |= (aLayout != nullptr) ^ (cLayout != nullptr);
}
- result = !result;
- return result;
+ return !result;
};
- if (!isValidSet())
+ if (!hasValidLayoutAttrs())
return emitOpError(
"layout attributes should be either set for all operands (for SIMT "
"code) or not set at all (for SIMD code).");
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 8b5e42af2f7b8..596befa335618 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -538,4 +538,114 @@ func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
-}
\ No newline at end of file
+}
+
+// -----
+func.func @tensor_desc_invalid_layout_attr(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected at least one of sg_layout, inst_data or lane_layout}}
+ #xegpu.layout<sg_data = [16, 2], lane_data = [1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected sg_layout and lane_layout to have the same rank}}
+ #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected sg_layout and inst_data to have the same rank}}
+ #xegpu.layout<sg_layout = [1, 1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected inst_data and lane_layout to have the same rank}}
+ #xegpu.layout<inst_data = [16, 2, 1], lane_layout = [8, 1], lane_data = [1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected lane_data and lane_layout to have the same rank}}
+ #xegpu.layout<inst_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2, 1]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected sg_data and sg_layout to have the same rank}}
+ #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2, 1], inst_data = [16, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected sg_layout being used with sg_data}}
+ #xegpu.layout<sg_data = [16, 2], lane_layout = [8, 1], lane_data = [1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected lane_layout being used with lane_data}}
+ #xegpu.layout<inst_data = [16, 2], lane_data = [1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error@+1 {{expected sg_layout/lane_layout being used with order}}
+ #xegpu.layout<inst_data = [16, 2], order = [0, 1]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error at +1 {{expected order and sg_layout to have the same rank}}
+ #xegpu.layout<sg_layout = [1, 1], sg_data = [16, 2], order = [0, 1, 2]>>
+ return
+}
+
+// -----
+func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ // expected-error at +1 {{expected order and lane_layout to have the same rank}}
+ #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 2], order = [0, 1, 2]>>
+ return
+}
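
A small self-contained C++ sketch of the pairing and rank invariants these negative tests exercise may help reviewers; the struct and helper names below are hypothetical stand-ins, not the actual LayoutAttr verifier.

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// Hypothetical mirror of the #xegpu.layout fields; not the real attribute.
struct LayoutSketch {
  std::optional<std::vector<int64_t>> sgLayout, sgData, instData;
  std::optional<std::vector<int64_t>> laneLayout, laneData, order;
};

// Two optional fields agree on rank if either is absent or their sizes match.
static bool sameRank(const std::optional<std::vector<int64_t>> &a,
                     const std::optional<std::vector<int64_t>> &b) {
  return !a || !b || a->size() == b->size();
}

// Sketch of the invariants checked by the negative tests above.
static bool verifySketch(const LayoutSketch &l) {
  // sg_data, lane_data, and order must be paired with a layout field.
  if (l.sgData && !l.sgLayout)
    return false;
  if (l.laneData && !l.laneLayout)
    return false;
  if (l.order && !l.sgLayout && !l.laneLayout)
    return false;
  // Every field that is present must agree on rank with the others.
  return sameRank(l.sgLayout, l.laneLayout) && sameRank(l.sgLayout, l.instData) &&
         sameRank(l.instData, l.laneLayout) && sameRank(l.laneLayout, l.laneData) &&
         sameRank(l.sgLayout, l.sgData) && sameRank(l.order, l.sgLayout) &&
         sameRank(l.order, l.laneLayout);
}

int main() {
  // Mirrors the first negative test above: sg_layout rank 3 vs lane_layout rank 2.
  LayoutSketch bad;
  bad.sgLayout = std::vector<int64_t>{1, 1, 1};
  bad.sgData = std::vector<int64_t>{16, 2, 1};
  bad.laneLayout = std::vector<int64_t>{8, 1};
  bad.laneData = std::vector<int64_t>{1, 2};
  return verifySketch(bad) ? 1 : 0; // expected to be rejected, so returns 0
}
```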
>From 3fb4fd447825dd6e92e6354a5629e230fac09552 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:08:34 +0000
Subject: [PATCH 34/53] add unit test
---
mlir/test/Dialect/XeGPU/invalid.mlir | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 596befa335618..48df33a591908 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -600,6 +600,16 @@ func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
return
}
+// -----
+func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) {
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
+ // expected-error at +1 {{expected layout rank to match tensor rank}}
+ !xegpu.tensor_desc<16x2xf32,
+ #xegpu.scatter_tdesc_attr<chunk_size = 2>,
+ #xegpu.layout<sg_layout = [1], sg_data = [32], inst_data = [16]>>
+ return
+}
+
// -----
func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) {
%1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> ->
>From 77fdfefb7b23b5a032f8f3da4c786ccc321a7e1a Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:12:46 +0000
Subject: [PATCH 35/53] remove dump file
---
.../SPIRV/IR/SPIRVIntelExtEmbargoOps.td | 85 -------------------
1 file changed, 85 deletions(-)
delete mode 100644 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
deleted file mode 100644
index e3e16a0966ada..0000000000000
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtEmbargoOps.td
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- SPIRVIntelExtEmbargoOps.td - Intel SPIR-V extensions ---------------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is the op definition spec of Intel-specific SPIR-V extensions
-// These extensions are not part of Khronos specification and available under
-// Embargo.
-// Supported extensions
-// * SPV_INTEL_region_group
-//===----------------------------------------------------------------------===//
-
-
-#ifndef MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
-#define MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
-
-// -----
-
-def SPIRV_INTELSubRegionControlBarrierOp : SPIRV_IntelVendorOp<"SubRegionControlBarrier", []> {
- let summary = "See extension SPV_INTEL_region_group";
-
- let description = [{
- Wait for all active invocations within the current sub-region to reach
- the current point of execution.
-
- All active invocations within the current sub-region reach this point of
- execution before any invocation proceeds beyond it.
-
- A sub-region is a subset of the workgroups in a region group. The region
- group is partitioned into groups of SubRegionSize workgroups, and
- the workgroups are ordered by their linearized ID. The first SubRegionSize
- workgroups in this sequence are the first sub-region, the next
- SubRegionSize workgroups are the next sub-region, etc. The total number of
- workgroups in the region-group must be evenly divisible by SubRegionSize,
- otherwise the behavior is undefined.
-
- Behavior is undefined unless all invocations within the current sub-region
- execute the same dynamic instance of this instruction. SubRegionSize value
- must be the same for all invocations within the current sub-region,
- or otherwise behavior is undefined.
-
- If Semantics is not None, this instruction also serves as an
- OpMemoryBarrier instruction, and also performs and adheres to the
- description and semantics of an OpMemoryBarrier instruction with the
- same Memory and Semantics operands. This allows atomically specifying
- both a control barrier and a memory barrier (that is, without needing
- two instructions). If Semantics is None, Memory is ignored.
-
- #### Example:
-
- ```mlir
- spirv.SubRegionControlBarrier %0, "RegionINTEL", "None"
- ```
-
- }];
-
-
- let availability = [
- MinVersion<SPIRV_V_1_0>,
- MaxVersion<SPIRV_V_1_6>,
- Extension<[SPV_INTEL_region_group]>,
- Capability<[SPIRV_C_RegionGroupINTEL]>
- ];
-
- let arguments = (ins
- SPIRV_Int32:$subregion_size,
- SPIRV_ScopeAttr:$memory_scope,
- SPIRV_MemorySemanticsAttr:$memory_semantics
- );
-
- let assemblyFormat = [{
- $subregion_size `,` $memory_scope `,` $memory_semantics attr-dict
- }];
-
- let results = (outs);
-
- let hasVerifier = 0;
-}
-
-// -----
-
-#endif // MLIR_DIALECT_SPIRV_IR_INTEL_EXT_EMBARGO_OPS
>From 2751332899762e16a0a7424f38512554f5f5ab90 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 17:16:01 +0000
Subject: [PATCH 36/53] fix typo
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 12b45a223183a..8cabfeec9b9de 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -564,7 +564,7 @@ LogicalResult StoreScatterOp::verify() {
[&]() { return emitOpError(); });
}
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
>From d281a149c266233ba33cf8f10368240ddc08a7d7 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Fri, 4 Apr 2025 18:08:07 +0000
Subject: [PATCH 37/53] fix an error after merging with main
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 81333f2589ee6..171a15ce27b59 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -298,17 +298,10 @@ LogicalResult TensorDescType::verify(
"contiguous elements";
}
- // For 1D tensor, pad the shape with an outer unit dimension to allow common
- // validation logic.
- SmallVector<int64_t> tensorShape(shape);
- if (rank == 1)
- tensorShape = {1, tensorShape.back()};
-
- size_t dims = tensorShape.size();
- for (size_t i = 0; i < dims; ++i) {
+ for (size_t i = 0; i < shape.size(); ++i) {
uint32_t numElemPerWi = laneLayout[i] * laneData[i];
- if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0)
- return emitError() << "cannot distribute " << tensorShape[i] << " over "
+ if (shape[i] < numElemPerWi || shape[i] % numElemPerWi != 0)
+ return emitError() << "cannot distribute " << shape[i] << " over "
<< laneLayout[i] << " work items with "
<< laneData[i] << " elements each";
}
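
For readers following along, here is a minimal standalone sketch of the per-dimension check this hunk keeps (dimension i must hold a whole number of laneLayout[i] * laneData[i] element groups); the helper name is hypothetical and not part of the patch.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical standalone version of the loop in TensorDescType::verify above.
static bool isDistributable(const std::vector<int64_t> &shape,
                            const std::vector<int64_t> &laneLayout,
                            const std::vector<int64_t> &laneData) {
  assert(shape.size() == laneLayout.size() && shape.size() == laneData.size());
  for (std::size_t i = 0; i < shape.size(); ++i) {
    int64_t numElemPerWi = laneLayout[i] * laneData[i];
    // Each dimension must be at least one full group and a multiple of it.
    if (shape[i] < numElemPerWi || shape[i] % numElemPerWi != 0)
      return false;
  }
  return true;
}

int main() {
  // 8x16 over lane_layout [1, 16] with lane_data [1, 1]: distributable.
  assert(isDistributable({8, 16}, {1, 16}, {1, 1}));
  // 8x12 over the same layout is not (12 is not a multiple of 16).
  assert(!isDistributable({8, 12}, {1, 16}, {1, 1}));
  return 0;
}
```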
>From fb28ce83df7346e6c836f512d6a10ad274b50af3 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Mon, 7 Apr 2025 15:23:47 +0000
Subject: [PATCH 38/53] new line at the end of file
---
mlir/test/Dialect/XeGPU/ops.mlir | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 54f14c6cb8c65..e9895e0d0a71d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -723,5 +723,4 @@ gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
gpu.return
}
-
}
>From f464662ab164c11cbe6630ddf6acd9f100c83c6e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 8 Apr 2025 21:52:23 +0000
Subject: [PATCH 39/53] update doc
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 6 +++---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 7 ++++++-
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 15aa053017b49..ab5fb4a4a7de9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -202,9 +202,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
```mlir
#xegpu.layout<inst_data = [8, 16], lane_layout = [2, 8], lane_data = [2, 2]>
```
- In this example, the original problem size is divided into smaller subproblems of size [8, 16],
- which are further distributed across 16 work-items organized as [[0, 1, 2, ..., 7], [8, 9, ..., 15]].
- Each work-item is assigned a contiguous 2x2 block.
+ In this example, the original problem size is partitioned into smaller subproblems of dimensions [8, 16],
+ which are then distributed among 16 work-items arranged as [[0, 1, 2, ..., 7], [8, 9, ..., 15]]. Each
+    work-item is assigned two 2x2 blocks (8 elements) in a round-robin manner.
4. Workgroup level layout:
```mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 8cabfeec9b9de..0d67e3d70f945 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -105,6 +105,12 @@ isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
<< " for tensor descriptor " << tdescTy;
}
+// Checks if the given shape is evenly distributed based on the layout
+// and data factors provided by the LayoutAttr. The function ensures that
+// each dimension of the shape can be evenly divided by the corresponding
+// data factor, and the resulting quotient can be evenly divided by the
+// layout factor. Returns `true` if the shape is evenly distributed,
+// otherwise `false`.
static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
xegpu::LayoutAttr attr) {
assert(attr && "Layout attribute is missing.");
@@ -121,7 +127,6 @@ static bool isEvenDistributed(llvm::ArrayRef<int64_t> shape,
}
for (auto [dimSize, dataFactor, layoutFactor] :
llvm::zip_equal(shape, data, layout)) {
- // check dimSize % (dataFactor * layoutFactor) != 0
if (dimSize % dataFactor != 0 || (dimSize / dataFactor) % layoutFactor != 0)
return false;
}
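
As a sanity check of the reworded documentation above, here is a hedged arithmetic sketch for the quoted example (inst_data = [8, 16], lane_layout = [2, 8], lane_data = [2, 2]): 128 elements over 16 lanes gives 8 elements, i.e. two 2x2 blocks, per work-item.

```cpp
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Sketch only: reproduces the per-work-item counts for the layout example
// quoted in the XeGPU_LayoutAttr documentation change above.
int main() {
  std::vector<int64_t> instData{8, 16}, laneLayout{2, 8}, laneData{2, 2};
  auto product = [](const std::vector<int64_t> &v) {
    return std::accumulate(v.begin(), v.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  int64_t elemsPerWi = product(instData) / product(laneLayout); // 128 / 16 = 8
  int64_t blocksPerWi = elemsPerWi / product(laneData);         // 8 / 4 = 2
  assert(elemsPerWi == 8 && blocksPerWi == 2);
  return 0;
}
```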
>From 2a1d373a61ca10bca9064a2afa7ac1fb88a87fc8 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 10 Apr 2025 18:45:30 +0000
Subject: [PATCH 40/53] Switch to 1D representation for SIMT
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 17 +-
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 3 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 26 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 227 +++++++++++-------
mlir/test/Dialect/XeGPU/invalid.mlir | 100 ++------
mlir/test/Dialect/XeGPU/ops.mlir | 162 ++++++-------
6 files changed, 250 insertions(+), 285 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 16a7f63d60c82..9af6eaf69aec3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -833,16 +833,14 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
also requires A and B to be loaded with the required data layout. Specially,
-
VNNI layout is required for B operand. It is achieved via adding `packed`
attribute to the `load_nd` operator. Due to the VNNI transformation, B operands
can be represented as a 3D vector, with the last dimension representing the VNNI
factor, which is computed as `32/bit_width_of_elem_type`. Thus, `B: vector<16x16xf16>`
can be represented as `B: vector<8x16x2xf16>`.
- In SIMT mode, DpasOp expects layout attributes `a`, `b`, and `c` (only if acc is used)
- which describe the data fragment owned by each work-item w.r.t. the tensor descriptor
- these data are loaded from.
+ In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
+ which are represented as 1D vectors.
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -850,13 +848,10 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
}];
let arguments = (ins
- XeGPU_DpasOpType : $lhs,
- XeGPU_DpasOpType : $rhs,
- Optional<XeGPU_Vector2DType>: $acc,
- OptionalAttr<XeGPU_LayoutAttr>:$a_layout,
- OptionalAttr<XeGPU_LayoutAttr>:$b_layout,
- OptionalAttr<XeGPU_LayoutAttr>:$c_layout);
- let results = (outs XeGPU_Vector2DType: $result);
+ XeGPU_DpasOprType : $lhs,
+ XeGPU_DpasOprType : $rhs,
+ Optional<XeGPU_DpasResType>: $acc);
+ let results = (outs XeGPU_DpasResType: $result);
let extraClassDeclaration = [{
VectorType getLhsType() {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 173f1462fdd73..3cb71788a15ef 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -17,7 +17,8 @@ def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64,
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
-def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
+def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 171a15ce27b59..269e445c3790c 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -10,6 +10,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/TypeSwitch.h"
+#include <numeric>
namespace mlir {
namespace xegpu {
@@ -336,19 +337,20 @@ LogicalResult TensorDescType::verify(
// [n_distribution_units, lane_data_size]
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
- // If no layout is provided, tensor desc is not used in SIMT mode.
- if (!layout)
+  // It only works for subgroup-level layouts, which carry only lane_layout
+  // and lane_data and are used to distribute SIMD code into SIMT code.
+ if (!layout || !layout.isSgLayout())
return failure();
SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
auto tdescShape = getShape();
- auto laneDataSize = 1, sgSize = 1;
- for (auto [laneDim, laneDataDim] : llvm::zip_equal(laneLayout, laneData)) {
- laneDataSize *= laneDataDim;
- sgSize *= laneDim;
- }
+  // Compute sgSize by multiplying the elements of laneLayout:
+ // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
+ // e.g. for 1D layout, sgSize = laneLayout[0]
+ auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+ std::multiplies<int64_t>());
// Case 1: regular loads/stores
auto scatterAttr = getEncodingAsScatterTensorDescAttr();
@@ -356,12 +358,9 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto chunkSize = scatterAttr.getChunkSize().getInt();
// Verify if the first dimension of the tensor descriptor shape is
// distributable.
- assert(tdescShape[0] % (laneLayout[0]) == 0 &&
+ assert(tdescShape[0] == laneLayout[0] &&
"tensor descriptor shape is not distributable");
- if (chunkSize > 1)
- return VectorType::get({chunkSize / laneDataSize, laneDataSize},
- getElementType());
- return VectorType::get({laneDataSize}, getElementType());
+ return VectorType::get({chunkSize}, getElementType());
}
// Case 2: block loads/stores
@@ -376,8 +375,7 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
// tensorSize must be adjusted for array_length.
tensorSize *= getArrayLength();
- return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
- getElementType());
+ return VectorType::get({tensorSize / sgSize}, getElementType());
}
} // namespace xegpu
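
A brief sketch of the 1D distributed size this hunk now produces for the block case, assuming an optional array_length and a subgroup of sgSize lanes (for scatter descriptors each lane simply keeps chunk_size elements):

```cpp
#include <cassert>
#include <cstdint>

// Sketch of getDistributedVectorType's block case after this change:
// per-lane SIMT vector length = tensorSize * arrayLength / sgSize.
int main() {
  int64_t tensorSize = 16 * 16; // e.g. tensor_desc<16x16xf16, array_length = 2>
  int64_t arrayLength = 2;
  int64_t sgSize = 16;          // lane_layout = [1, 16]
  int64_t perLane = tensorSize * arrayLength / sgSize;
  assert(perLane == 32);        // matches the vector<32xf16> results in ops.mlir
  return 0;
}
```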
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0d67e3d70f945..fef39508c3bfe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -73,38 +73,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
}
-// Helper to validate value shape of LoadNd and StoreNd ops.
-static LogicalResult
-isArgShapesValid(TensorDescType tdescTy, VectorType valueTy,
- ArrayRef<int64_t> adjustedTdescShape,
- function_ref<InFlightDiagnostic()> emitError) {
- auto layout = tdescTy.getLayoutAttr();
- auto valueShape = valueTy.getShape();
- // layout not present means IR is in SIMD mode. In this case value shape must
- // match adjusted tensor descriptor shape.
- if (!layout)
- return valueShape == adjustedTdescShape
- ? success()
- : emitError()
- << "Value shape " << makeString(valueShape)
- << " is not consistent with tensor descriptor " << tdescTy;
-
- // layout present means IR is in SIMT mode. In this case layout determines the
- // value shape.
- auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
- assert(succeeded(expectedValueShapeOrFailure) &&
- "Failed to compute distributed vector shape for "
- "tensor descriptor ");
-
- return valueTy == expectedValueShapeOrFailure.value()
- ? success()
- : emitError()
- << "Result shape " << makeString(valueShape)
- << " is not consistent with distributed vector shape "
- << makeString(expectedValueShapeOrFailure.value().getShape())
- << " for tensor descriptor " << tdescTy;
-}
-
// Checks if the given shape is evenly distributed based on the layout
// and data factors provided by the LayoutAttr. The function ensures that
// each dimension of the shape can be evenly divided by the corresponding
@@ -302,9 +270,35 @@ LogicalResult LoadNdOp::verify() {
if (!isReadHintOrNone(getL3HintAttr()))
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
+ // Handling a 1D vector as the result can be complex. It may represent the
+ // outcome of a 1D block load in SIMD mode or a fragment of a block load
+ // result in SIMT mode. In the latter case, the tensor descriptor must be
+ // evenly distributed, with each lane holding an equally sized fragment of
+ // the result. Only subgroup size 8 or 16 is supported.
+ if (valueTy.getRank() == 1 &&
+ valueTy.getNumElements() < tdescTy.getNumElements()) {
+ // SIMT mode doesn't need LayoutAttr.
+ if (tdescTy.getLayoutAttr())
+ return emitOpError()
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+
+ int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
+ int valueElems = valueTy.getNumElements();
+
+ int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
+ if (lanes != 16 && lanes != 8) {
+ return emitOpError()
+ << "Result shape " << makeString(getShapeOf(valueTy))
+ << " is not a valid distribution for tensor descriptor "
+ << tdescTy;
+ }
+ return success();
+ }
+
+ // Check SIMD mode.
auto array_len = tdescTy.getArrayLength();
// adjusted tensor descriptor shape tracks the expected shape of the result.
- auto adjustedTdescShape = getShapeOf(tdescTy);
+ auto tdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
if (getTranspose()) {
@@ -316,7 +310,7 @@ LogicalResult LoadNdOp::verify() {
});
if (valid)
- transpose(trans, adjustedTdescShape);
+ transpose(trans, tdescShape);
else
mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";
}
@@ -325,8 +319,8 @@ LogicalResult LoadNdOp::verify() {
if (tdescTy.getRank() == 2) {
const int axis = 0;
auto vnni_factor = valueShape.back();
- adjustedTdescShape[axis] /= vnni_factor;
- adjustedTdescShape.push_back(vnni_factor);
+ tdescShape[axis] /= vnni_factor;
+ tdescShape.push_back(vnni_factor);
} else {
mlir::emitWarning(getLoc())
<< "Invalid Packed Attr. It is ignored (available for 2D "
@@ -335,12 +329,16 @@ LogicalResult LoadNdOp::verify() {
}
if (array_len > 1) {
- auto it = adjustedTdescShape.begin();
- adjustedTdescShape.insert(it, array_len);
+ tdescShape.insert(tdescShape.begin(), array_len);
+ }
+
+ if (tdescShape != valueShape) {
+ return emitOpError() << "Result shape " << makeString(valueShape)
+ << " is not consistent with tensor descriptor "
+ << tdescTy;
}
- return isArgShapesValid(tdescTy, valueTy, adjustedTdescShape,
- [&]() { return emitOpError(); });
+ return success();
}
//===----------------------------------------------------------------------===//
@@ -371,8 +369,37 @@ LogicalResult StoreNdOp::verify() {
auto tdescShape = getShapeOf(dstTy);
auto valueShape = getShapeOf(valTy);
- return isArgShapesValid(dstTy, valTy, tdescShape,
- [&]() { return emitOpError(); });
+ // Similar to LoadNdOp, handling a 1D vector as the value can be complex. It
+ // may represent the input of a 1D block store in SIMD mode or a fragment of
+ // a block store input in SIMT mode. In the latter case, the tensor descriptor
+ // must be evenly distributed, with each lane holding an equally sized
+ // fragment of the input. Only subgroup size 8 or 16 is supported.
+ if (valTy.getRank() == 1 && valTy.getNumElements() < dstTy.getNumElements()) {
+ // SIMT mode doesn't need LayoutAttr.
+ if (dstTy.getLayoutAttr())
+ return emitOpError()
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+
+ int tdescElems = dstTy.getNumElements() * dstTy.getArrayLength();
+ int valueElems = valueShape[0];
+
+ int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
+ if (lanes != 16 && lanes != 8) {
+ return emitOpError()
+ << "Value shape " << makeString(getShapeOf(valTy))
+ << " is not a valid distribution for tensor descriptor " << dstTy;
+ }
+ return success();
+ }
+
+ // SIMD code should have the same shape as the tensor descriptor.
+ if (tdescShape != valueShape) {
+ return emitOpError() << "Value shape " << makeString(valueShape)
+ << " is not consistent with tensor descriptor "
+ << dstTy;
+ }
+
+ return success();
}
//===----------------------------------------------------------------------===//
@@ -520,14 +547,41 @@ LogicalResult LoadGatherOp::verify() {
if (tdescShape[0] != maskShape[0])
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
+ auto chunkSize = tdescTy.getChunkSize();
+  // For SIMT code, the value should be a 1D vector whose size equals chunkSize.
+ if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
+ if (valueTy.getNumElements() != chunkSize) {
+ return emitOpError()
+ << "Result shape " << makeString(valueShape)
+ << " is not a valid distribution for tensor descriptor "
+ << tdescTy;
+ } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
+ if (tdescTy.getLayoutAttr())
+ return emitOpError()
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+ if (getTransposeAttr())
+ return emitOpError() << "doesn't need TransposeAttr for SIMT code";
+ }
+ return success();
+ } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
+    // For the 1D vector case where valueTy.getNumElements() == tdescShape[0],
+    // it is valid SIMT code if chunkSize happens to be the same as the
+    // subgroup size, e.g., tensor_desc<16x16xf16, chunk_size = 16>
+ return success();
+ }
+
+ // For SIMD code verification.
if (tdescTy.getRank() == 2) {
if (!getTransposeAttr())
return emitOpError("load of rank-2 tensor has to be transposed.");
transpose({1, 0}, tdescShape);
}
- return isArgShapesValid(tdescTy, valueTy, tdescShape,
- [&]() { return emitOpError(); });
+ if (tdescShape != valueShape)
+ return emitOpError() << "Result shape " << makeString(valueShape)
+ << " is not consistent with tensor descriptor "
+ << tdescTy;
+ return success();
}
//===----------------------------------------------------------------------===//
@@ -559,14 +613,42 @@ LogicalResult StoreScatterOp::verify() {
if (tdescShape[0] != maskShape[0])
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
+ auto chunkSize = tdescTy.getChunkSize();
+  // For SIMT code, the value should be a 1D vector whose size equals chunkSize.
+ if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
+ if (valueTy.getNumElements() != chunkSize) {
+ return emitOpError()
+ << "Value shape " << makeString(valueShape)
+ << " is not a valid distribution for tensor descriptor "
+ << tdescTy;
+ } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
+ if (tdescTy.getLayoutAttr())
+ return emitOpError()
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+ if (getTransposeAttr())
+ return emitOpError() << "doesn't need TransposeAttr for SIMT code";
+ }
+ return success();
+ } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
+    // For the 1D vector case where valueTy.getNumElements() == tdescShape[0],
+    // it is valid SIMT code if chunkSize happens to be the same as the
+    // subgroup size, e.g., tensor_desc<16x16xf16, chunk_size = 16>
+ return success();
+ }
+
+  // For SIMD code verification.
if (tdescTy.getRank() == 2) {
if (!getTransposeAttr())
return emitOpError("Store of a rank-2 tensor has to be transposed.");
transpose({1, 0}, tdescShape);
}
- return isArgShapesValid(tdescTy, valueTy, tdescShape,
- [&]() { return emitOpError(); });
+ if (tdescShape != valueShape)
+ return emitOpError() << "Value shape " << makeString(valueShape)
+ << " is not consistent with tensor descriptor "
+ << tdescTy;
+
+ return success();
}
//===----------------------------------------------------------------------===//
@@ -602,51 +684,16 @@ LogicalResult DpasOp::verify() {
auto rhsShape = getRhsType().getShape();
auto resShape = getResultType().getShape();
- auto aLayout = getALayoutAttr();
- auto bLayout = getBLayoutAttr();
- auto cLayout = getCLayoutAttr();
-
- // make sure the layout attribute is either set for every available
- // operand or simply not set at all. C is special, since ACC is optional.
- auto hasValidLayoutAttrs = [&]() {
- bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
- if (hasAcc()) {
- result |= (aLayout != nullptr) ^ (cLayout != nullptr);
- }
- return !result;
- };
+ if (getAcc()) {
+ if (getAcc().getType() != getResultType())
+ return emitOpError("Expecting the acc type to be the same as result.");
+ }
- if (!hasValidLayoutAttrs())
- return emitOpError(
- "layout attributes should be either set for all operands (for SIMT "
- "code) or not set at all (for SIMD code).");
-
- // query the scope from aLayout (a valid setting).
- if (aLayout) {
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto laneLayoutA = aLayout.getLaneLayout();
- auto laneLayoutB = bLayout.getLaneLayout();
- auto laneLayoutC = cLayout.getLaneLayout();
- // Obtain the expanded shapes of the operands and result using lane_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
- lhsShape[1] * laneLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
- resShape[1] * laneLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
- } else { // For other scopes, operands' shape should match the mxkxn
- // semantics.
+  // SIMT code: skip the check due to the lack of semantic info at this level;
+  // users need to ensure correctness.
+ if (lhsRank == 1 && rhsRank == 1 && resRank == 1) {
+ return success();
+ } else { // SIMD code
if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
return emitOpError(
"expecting lhs and result to be a 2D vector, and rhs to be either "
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 48df33a591908..c0739d735dfec 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -79,25 +79,10 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1]}}
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- -> vector<8x2xf32>
- return
-}
-
-// -----
-func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- // expected-error at +1 {{Result shape [8] is not consistent with distributed vector shape [1, 1]}}
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+ // expected-error at +1 {{Result shape [8] is not a valid distribution for tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- -> vector<8xf32>
+ l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<8xf32>
return
}
@@ -105,7 +90,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
func.func @test_load_nd_vc_6(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
!xegpu.tensor_desc<8x16xf32>
- // expected-error at +1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
+ // expected-error at +1 {{Result shape [8, 1] is not consistent with tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32>
@@ -134,22 +119,10 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) {
- %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // expected-error at +1 {{Result shape [8, 2] is not consistent with distributed vector shape [8, 1] for tensor descriptor}}
- xegpu.store_nd %data, %1
- : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- return
-}
-
-// -----
-func.func @test_store_nd_layout(%dst: memref<24x32xf32>, %data: vector<2xf32>) {
- %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- // expected-error at +1 {{Result shape [2] is not consistent with distributed vector shape [1, 1] for tensor descriptor}}
- xegpu.store_nd %data, %1
- : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<4xf32>) {
+ %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
+ // expected-error at +1 {{Value shape [4] is not a valid distribution for tensor descriptor}}
+ xegpu.store_nd %data, %1 : vector<4xf32>, !xegpu.tensor_desc<16xf32>
return
}
@@ -269,45 +242,23 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
}
// -----
-func.func @test_load_gather_layout_1(%src: ui64) {
+func.func @test_load_gather_simt_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<1x2xf32>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ // expected-error at +1 {{Result shape [6] is not a valid distribution for tensor descriptor}}
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<6xf32>
return
}
// -----
-func.func @test_load_gather_layout_2(%src: ui64) {
+func.func @test_store_scatter_simt_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2xf32>
- return
-}
-
-
-// -----
-func.func @test_store_scatter_layout_1(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %val = arith.constant dense<2.9>: vector<1x2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- // expected-error at +1 {{Result shape [1, 2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<1x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
- return
-}
-
-// -----
-func.func @test_store_scatter_layout_2(%src: ui64) {
- %0 = arith.constant dense<1>: vector<4xi1>
- %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
- %val = arith.constant dense<2.9>: vector<2xf32>
- %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- // expected-error at +1 {{esult shape [2] is not consistent with distributed vector shape [2, 1] for tensor descriptor}}
- xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>, transpose}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ %val = arith.constant dense<2.9>: vector<6xf32>
+ %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ // expected-error at +1 {{Value shape [6] is not a valid distribution for tensor descriptor}}
+ xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
return
}
@@ -393,23 +344,6 @@ func.func @test_dpas_4(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
return
}
-// -----
-func.func @test_dpas_layout_1(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // expected-error at +1 {{layout attributes should be either set for all operands (for SIMT code) or not set at all (for SIMD code)}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- return
-}
-
-// -----
-func.func @test_dpas_layout_2(%a : vector<8x1xf16>, %b: vector<4x2xf16>) {
- // expected-error at +1 {{K-dimension mismatch}}
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x1xf16>, vector<4x2xf16> -> vector<8x1xf32>
- return
-}
-
// -----
func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
%0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index e9895e0d0a71d..71e7e9bdda07d 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -125,11 +125,11 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
%2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<4x2xf16>
+ : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
gpu.return
}
@@ -144,10 +144,10 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<1x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
gpu.return
}
@@ -162,11 +162,10 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
gpu.return
}
@@ -181,11 +180,10 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<8x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
gpu.return
}
@@ -200,11 +198,10 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
- !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<2x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
gpu.return
}
@@ -219,11 +216,11 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x1xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
gpu.return
}
@@ -238,11 +235,11 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
- !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x2xf16>
+ !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>> -> vector<32xf16>
gpu.return
}
@@ -257,10 +254,10 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
- // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
- %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<8x1xf32>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
+ %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
gpu.return
}
@@ -277,13 +274,12 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48x1xf16>
- %1 = arith.constant dense<1.0>: vector<48x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48x1xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16>
+ %1 = arith.constant dense<1.0>: vector<48xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16>
gpu.return
}
@@ -303,13 +299,12 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) {
gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
- // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2x1xf16>
- %1 = arith.constant dense<1.0>: vector<2x1xf16>
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> ->
- !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2x1xf16>, !xegpu.tensor_desc<32xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
+ %1 = arith.constant dense<1.0>: vector<2xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<32xf16>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<2xf16>, !xegpu.tensor_desc<32xf16>
gpu.return
}
@@ -425,10 +420,10 @@ gpu.func @test_load_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1> -> vector<2x1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1> -> vector<2xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<2xf32>
gpu.return
}
@@ -451,10 +446,10 @@ gpu.func @test_load_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1> -> vector<1xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32>
gpu.return
}
@@ -477,10 +472,10 @@ gpu.func @test_load_simt_3(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
- %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
- //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
- %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1> -> vector<4x2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
+ %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
+ //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<4xi1> -> vector<8xf16>
+ %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<4xi1> -> vector<8xf16>
gpu.return
}
@@ -507,12 +502,12 @@ gpu.func @test_store_simt(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2x1xf32>
- %2 = arith.constant dense<2.9>: vector<2x1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32>
+ %2 = arith.constant dense<2.9>: vector<2xf32>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
gpu.return
}
@@ -539,12 +534,12 @@ gpu.func @test_store_simt_2(%src: ui64) {
%0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
//CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
%1 = arith.constant dense<1>: vector<4xi1>
- //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<1x2xf16>
- %2 = arith.constant dense<2.9>: vector<1x2xf16>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<1x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>, vector<4xi1>
+ //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16>
+ %2 = arith.constant dense<2.9>: vector<2xf16>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
gpu.return
}
@@ -572,10 +567,10 @@ gpu.func @test_store_simt_3(%src: ui64) {
%1 = arith.constant dense<1>: vector<4xi1>
//CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
%2 = arith.constant dense<2.9>: vector<1xf32>
- //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
- %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
- //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
- xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>, vector<4xi1>
+ //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
+ //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
+ xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1>
gpu.return
}
@@ -635,15 +630,10 @@ gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
gpu.return
}
-// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8x1xf16>, %[[arg1:.*]]: vector<8x2xf16>)
-gpu.func @test_dpas_simt(%a : vector<8x1xf16>, %b: vector<8x2xf16>) {
- // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- // CHECK: b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- // CHECK: c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
- %1 = xegpu.dpas %a, %b {a_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- b_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- c_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x1xf16>, vector<8x2xf16> -> vector<8x1xf32>
+// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
+gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) {
+ // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+ %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32>
gpu.return
}
>From 2159119977dfb62c11d808777529dd34ed0abd43 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Thu, 10 Apr 2025 20:25:00 +0000
Subject: [PATCH 41/53] refine verifier for load_nd and store_nd
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 4 +-
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 53 +++++++++----------
mlir/test/Dialect/XeGPU/invalid.mlir | 19 +++++--
3 files changed, 43 insertions(+), 33 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 9af6eaf69aec3..5fa18754305ca 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -840,7 +840,9 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
can be represented as `B: vector<8x16x2xf16>`.
In SIMT code, each work-item from a subgroup holds a data fragment for A, B, C and the result,
- which are represented as 1D vectors.
+ which are represented as 1D vectors. Please refer to [OpenCL Intel extensions]
+ (https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html)
+ for more details about the fragment distribution.
Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
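As a concrete illustration of the 1D fragment convention described above (a sketch only, assuming a subgroup size of 16 and the fragment shapes used by the test_dpas_simt case earlier in this series), an 8x16x16 dpas in SIMT form reduces to per-lane 1D operands:

  // Each lane holds an 8-element A fragment, a 16-element B fragment, and an
  // 8-element result fragment; %a_frag and %b_frag are hypothetical values.
  %r = xegpu.dpas %a_frag, %b_frag : vector<8xf16>, vector<16xf16> -> vector<8xf32>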
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index fef39508c3bfe..1dafc9936107e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -270,33 +270,31 @@ LogicalResult LoadNdOp::verify() {
if (!isReadHintOrNone(getL3HintAttr()))
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
- // Handling a 1D vector as the result can be complex. It may represent the
- // outcome of a 1D block load in SIMD mode or a fragment of a block load
- // result in SIMT mode. In the latter case, the tensor descriptor must be
- // evenly distributed, with each lane holding an equally sized fragment of
- // the result. Only subgroup size 8 or 16 is supported.
- if (valueTy.getRank() == 1 &&
- valueTy.getNumElements() < tdescTy.getNumElements()) {
+ int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
+ int valueElems = valueTy.getNumElements();
+
+ // If the result vector is 1D and has fewer elements than the tensor
+ // descriptor, it is supposed to be a SIMT op. The layout attribute in
+ // tensor_desc is not needed.
+ if (valueElems < tdescElems && valueTy.getRank() == 1) {
// SIMT mode doesn't need LayoutAttr.
if (tdescTy.getLayoutAttr())
return emitOpError()
<< "TensorDesc doesn't need LayoutAttr for SIMT code";
- int tdescElems = tdescTy.getNumElements() * tdescTy.getArrayLength();
- int valueElems = valueTy.getNumElements();
-
- int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
- if (lanes != 16 && lanes != 8) {
+ // For SIMT code, the load is evenly distributed across all lanes in a
+ // subgroup. Since subgroup size is arch dependent, we only check even
+ // distribution here.
+ if (tdescElems % valueElems)
return emitOpError()
<< "Result shape " << makeString(getShapeOf(valueTy))
<< " is not a valid distribution for tensor descriptor "
<< tdescTy;
- }
+
return success();
}
// Check SIMD mode.
- auto array_len = tdescTy.getArrayLength();
// adjusted tensor descriptor shape tracks the expected shape of the result.
auto tdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
@@ -328,6 +326,7 @@ LogicalResult LoadNdOp::verify() {
}
}
+ auto array_len = tdescTy.getArrayLength();
if (array_len > 1) {
tdescShape.insert(tdescShape.begin(), array_len);
}
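To make the relaxed check concrete (a sketch, not part of the patch): any 1D result whose element count evenly divides the descriptor size now verifies, so a 16-element descriptor may be distributed as 8 elements per lane, which the old lanes-must-be-8-or-16 rule rejected:

  // Accepted: 16 % 8 == 0, so the fragment is an even split of the descriptor.
  %frag = xegpu.load_nd %tdesc : !xegpu.tensor_desc<16xf32> -> vector<8xf32>
  // Rejected: 16 % 3 != 0 (see the updated invalid.mlir test below).
  // %bad = xegpu.load_nd %tdesc : !xegpu.tensor_desc<16xf32> -> vector<3xf32>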
@@ -366,25 +365,23 @@ LogicalResult StoreNdOp::verify() {
if (!isWriteHintOrNone(getL3HintAttr()))
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
- auto tdescShape = getShapeOf(dstTy);
- auto valueShape = getShapeOf(valTy);
+ auto array_len = dstTy.getArrayLength();
+ if (array_len > 1)
+ return emitOpError("array length is not supported by store_nd.\n");
+
+ auto tdescElems = dstTy.getNumElements();
+ auto valueElems = valTy.getNumElements();
- // Similar to LoadNdOp, handling a 1D vector as the value can be complex. It
- // may represent the input of a 1D block store in SIMD mode or a fragment of
- // a block store input in SIMT mode. In the latter case, the tensor descriptor
- // must be evenly distributed, with each lane holding an equally sized
- // fragment of the input. Only subgroup size 8 or 16 is supported.
- if (valTy.getRank() == 1 && valTy.getNumElements() < dstTy.getNumElements()) {
+ // Similar to LoadNdOp, if the value vector is 1D and has fewer elements than
+ // the tensor descriptor, it is supposed to be a SIMT op. The layout attribute
+ // in tensor_desc is not needed.
+ if (valTy.getRank() == 1 && valueElems < tdescElems) {
// SIMT mode doesn't need LayoutAttr.
if (dstTy.getLayoutAttr())
return emitOpError()
<< "TensorDesc doesn't need LayoutAttr for SIMT code";
- int tdescElems = dstTy.getNumElements() * dstTy.getArrayLength();
- int valueElems = valueShape[0];
-
- int lanes = tdescElems % valueElems == 0 ? tdescElems / valueElems : -1;
- if (lanes != 16 && lanes != 8) {
+ if (tdescElems % valueElems) {
return emitOpError()
<< "Value shape " << makeString(getShapeOf(valTy))
<< " is not a valid distribution for tensor descriptor " << dstTy;
@@ -393,6 +390,8 @@ LogicalResult StoreNdOp::verify() {
}
// SIMD code should have the same shape as the tensor descriptor.
+ auto tdescShape = getShapeOf(dstTy);
+ auto valueShape = getShapeOf(valTy);
if (tdescShape != valueShape) {
return emitOpError() << "Value shape " << makeString(valueShape)
<< " is not consistent with tensor descriptor "
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index c0739d735dfec..a02427b6e317b 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -80,9 +80,9 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
// -----
func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
%1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
- // expected-error@+1 {{Result shape [8] is not a valid distribution for tensor descriptor}}
+ // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}}
%2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<8xf32>
+ l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf32> -> vector<3xf32>
return
}
@@ -119,10 +119,19 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
}
// -----
-func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<4xf32>) {
+func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
+ %1 = arith.constant dense<1.0>: vector<2x24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ // expected-error@+1 {{array length is not supported by store_nd}}
+ xegpu.store_nd %1, %2: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
+ return
+}
+
+// -----
+func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
%1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
- // expected-error@+1 {{Value shape [4] is not a valid distribution for tensor descriptor}}
- xegpu.store_nd %data, %1 : vector<4xf32>, !xegpu.tensor_desc<16xf32>
+ // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}}
+ xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32>
return
}
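For contrast with the negative cases above (illustrative only): under the new even-distribution rule, the value shape from the previous version of this test is now legal, since 16 % 4 == 0:

  // Now verifies: each lane holds 4 of the 16 descriptor elements.
  xegpu.store_nd %data, %1 : vector<4xf32>, !xegpu.tensor_desc<16xf32>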
>From 21f50c09992cf9ef629ab02036d2b4be273113e1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 10 Apr 2025 20:31:43 +0000
Subject: [PATCH 42/53] fix issues
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 ++
.../Transforms/XeGPUSubgroupDistribute.cpp | 52 +++++++++----------
2 files changed, 29 insertions(+), 27 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 254568e00dfcb..53372a23a2182 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -378,6 +378,10 @@ FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
// tensorSize must be adjusted for array_length.
tensorSize *= getArrayLength();
+ if (layout.getRank() == 1) {
+ return VectorType::get({tensorSize / sgSize}, getElementType());
+ }
+
return VectorType::get({tensorSize / (sgSize * laneDataSize), laneDataSize},
getElementType());
}
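A worked instance of the new rank-1 branch (a sketch, assuming the subgroup size equals the lane_layout product): for a 32-element f16 descriptor with lane_layout = [16], tensorSize / sgSize = 32 / 16 = 2, matching the vector<2xf16> fragment used by the updated store_nd test earlier in this series:

  // Per-lane fragment type computed by getDistributedVectorType for a 1D layout.
  xegpu.store_nd %frag, %tdesc : vector<2xf16>, !xegpu.tensor_desc<32xf16>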
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 5b812a731ec95..ed9418696c69b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -36,6 +36,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/ADT/bit.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
@@ -781,30 +782,27 @@ namespace {
/// | 2x32x16 | [1, 16] | 2x32x1 |
FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
VectorType originalType) {
- llvm::SmallVector<int64_t, 2> distributedShape;
if (!layout)
return failure();
- auto laneLayout = layout.getLaneLayout();
- assert((originalType.getRank() == 2 || originalType.getRank() == 3) &&
- "expecting 2D or 3D shape for the original vector type");
- assert(laneLayout.size() == 2 && "expecting 2D shape for the wi layout");
- // Original type can be 2D or 3D (array_length > 1), the last two dims are the
- // block shape.
- auto blockShape = originalType.getShape().take_back(2);
- // Check if the block vector shape can be distributed evenly.
- if (blockShape[0] % laneLayout[0] != 0 || blockShape[1] % laneLayout[1] != 0)
- return failure();
-
- if (originalType.getRank() == 3) {
- distributedShape.push_back(originalType.getShape()[0]);
- }
- for (unsigned i = 0; i < 2; ++i) {
- distributedShape.push_back(blockShape[i] / laneLayout[i]);
+ auto laneLayout = layout.getLaneLayout().asArrayRef();
+ assert(originalType.getShape().size() >= laneLayout.size() &&
+ "Rank of the original vector type should be greater or equal to the "
+ "size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+ /// dimensions are not distributed.
+ unsigned distributionStart = originalType.getRank() - laneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart) {
+ continue;
+ }
+ /// Check if the dimension can be distributed evenly.
+ if (dim % laneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / laneLayout[i - distributionStart];
}
- auto newVectorType =
- VectorType::get(distributedShape, originalType.getElementType());
- return newVectorType;
+ return VectorType::get(distributedShape, originalType.getElementType());
}
static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
@@ -1028,15 +1026,14 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
return rewriter.notifyMatchFailure(
storeOp, "the source tensor descriptor lacks sg_map attribute");
- if (storeOp.getTensorDescType().getShape().size() != 2)
- return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-
- auto distriburtedTypeByWarpOp =
+ auto distributedTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
- if (failed(distriburtedTypeByWarpOp))
+ if (failed(distributedTypeByWarpOpOrFailure))
return rewriter.notifyMatchFailure(storeOp,
"Failed to distribute the type");
- VectorType distributedTypeByWarpOp = distriburtedTypeByWarpOp.value();
+ VectorType distributedTypeByWarpOp =
+ distributedTypeByWarpOpOrFailure.value();
+ llvm::errs() << "distributed type: " << distributedTypeByWarpOp << "\n";
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -1066,7 +1063,8 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
- newStoreOperands, storeOp->getAttrs());
+ newStoreOperands);
+ storeOp->setDialectAttrs(storeOp->getDialectAttrs());
rewriter.eraseOp(storeOp);
return success();
}
>From c81b2e05e6a5fde3c314d557d7993f8b73cf66cd Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 10 Apr 2025 22:18:05 +0000
Subject: [PATCH 43/53] fix issues
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 130 +++++++++---------
.../Transforms/XeGPUSubgroupDistribute.cpp | 91 +++++++-----
.../Dialect/XeGPU/subgroup-distribution.mlir | 70 ++++++++++
3 files changed, 192 insertions(+), 99 deletions(-)
create mode 100644 mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cb5b87d233595..d563bce6b9c9b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -597,70 +597,72 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//
LogicalResult DpasOp::verify() {
- int64_t lhsRank = getLhsType().getRank();
- int64_t rhsRank = getRhsType().getRank();
- int64_t resRank = getResultType().getRank();
- auto lhsShape = getLhsType().getShape();
- auto rhsShape = getRhsType().getShape();
- auto resShape = getResultType().getShape();
-
- auto aLayout = getALayoutAttr();
- auto bLayout = getBLayoutAttr();
- auto cLayout = getCLayoutAttr();
-
- // make sure the layout attribute is either set for every available
- // operand or simply not set at all. C is special, since ACC is optional.
- auto hasValidLayoutAttrs = [&]() {
- bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
- if (hasAcc()) {
- result |= (aLayout != nullptr) ^ (cLayout != nullptr);
- }
- return !result;
- };
-
- if (!hasValidLayoutAttrs())
- return emitOpError(
- "layout attributes should be either set for all operands (for SIMT "
- "code) or not set at all (for SIMD code).");
-
- // query the scope from aLayout (a valid setting).
- if (aLayout) {
- // In SIMT mode, All data fragments must be 2D
- if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
- return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
-
- auto laneLayoutA = aLayout.getLaneLayout();
- auto laneLayoutB = bLayout.getLaneLayout();
- auto laneLayoutC = cLayout.getLaneLayout();
- // Obtain the expanded shapes of the operands and result using lane_layout.
- // NOTE: For B, get rid of the packed dimension for the expanded shape.
- SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
- lhsShape[1] * laneLayoutA[1]};
- SmallVector<int64_t> expandedShapeB = {
- rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
- SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
- resShape[1] * laneLayoutC[1]};
- auto bK = expandedShapeB[0];
- if (bK != expandedShapeA[1])
- return emitOpError("K-dimension mismatch.");
- if (expandedShapeA[0] != expandedShapeC[0])
- return emitOpError("M-dimension mismatch.");
- if (expandedShapeB[1] != expandedShapeC[1])
- return emitOpError("N-dimension mismatch.");
- } else { // For other scopes, operands' shape should match the mxkxn
- // semantics.
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
- return emitOpError(
- "expecting lhs and result to be a 2D vector, and rhs to be either "
- "2D or 3D (packed) vector.");
- auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
- if (bK != lhsShape[1])
- return emitOpError("K-dimension mismatch.");
- if (lhsShape[0] != resShape[0])
- return emitOpError("M-dimension mismatch.");
- if (rhsShape[1] != resShape[1])
- return emitOpError("N-dimension mismatch.");
- }
+ // int64_t lhsRank = getLhsType().getRank();
+ // int64_t rhsRank = getRhsType().getRank();
+ // int64_t resRank = getResultType().getRank();
+ // auto lhsShape = getLhsType().getShape();
+ // auto rhsShape = getRhsType().getShape();
+ // auto resShape = getResultType().getShape();
+
+ // auto aLayout = getALayoutAttr();
+ // auto bLayout = getBLayoutAttr();
+ // auto cLayout = getCLayoutAttr();
+
+ // // make sure the layout attribute is either set for every available
+ // // operand or simply not set at all. C is special, since ACC is optional.
+ // auto hasValidLayoutAttrs = [&]() {
+ // bool result = (aLayout != nullptr) ^ (bLayout != nullptr);
+ // if (hasAcc()) {
+ // result |= (aLayout != nullptr) ^ (cLayout != nullptr);
+ // }
+ // return !result;
+ // };
+
+ // if (!hasValidLayoutAttrs())
+ // return emitOpError(
+ // "layout attributes should be either set for all operands (for SIMT "
+ // "code) or not set at all (for SIMD code).");
+
+ // // query the scope from aLayout (a valid setting).
+ // if (aLayout) {
+ // // In SIMT mode, All data fragments must be 2D
+ // if (lhsRank != 2 || rhsRank != 2 || resRank != 2)
+ // return emitOpError("expecting lhs, rhs, and result to be a 2D
+ // vector.");
+
+ // auto laneLayoutA = aLayout.getLaneLayout();
+ // auto laneLayoutB = bLayout.getLaneLayout();
+ // auto laneLayoutC = cLayout.getLaneLayout();
+ // // Obtain the expanded shapes of the operands and result using
+ // lane_layout.
+ // // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ // SmallVector<int64_t> expandedShapeA = {lhsShape[0] * laneLayoutA[0],
+ // lhsShape[1] * laneLayoutA[1]};
+ // SmallVector<int64_t> expandedShapeB = {
+ // rhsShape[0] * rhsShape[1] * laneLayoutB[0], 1 * laneLayoutB[1]};
+ // SmallVector<int64_t> expandedShapeC = {resShape[0] * laneLayoutC[0],
+ // resShape[1] * laneLayoutC[1]};
+ // auto bK = expandedShapeB[0];
+ // if (bK != expandedShapeA[1])
+ // return emitOpError("K-dimension mismatch.");
+ // if (expandedShapeA[0] != expandedShapeC[0])
+ // return emitOpError("M-dimension mismatch.");
+ // if (expandedShapeB[1] != expandedShapeC[1])
+ // return emitOpError("N-dimension mismatch.");
+ // } else { // For other scopes, operands' shape should match the mxkxn
+ // // semantics.
+ // if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resRank != 2)
+ // return emitOpError(
+ // "expecting lhs and result to be a 2D vector, and rhs to be either "
+ // "2D or 3D (packed) vector.");
+ // auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+ // if (bK != lhsShape[1])
+ // return emitOpError("K-dimension mismatch.");
+ // if (lhsShape[0] != resShape[0])
+ // return emitOpError("M-dimension mismatch.");
+ // if (rhsShape[1] != resShape[1])
+ // return emitOpError("N-dimension mismatch.");
+ // }
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ed9418696c69b..34e0ac7b2d094 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -38,6 +38,8 @@
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
@@ -63,6 +65,8 @@ constexpr unsigned packedSizeInBitsForDefault =
16; // Minimum packing size per register for DPAS A.
constexpr unsigned packedSizeInBitsForDpasB =
32; // Minimum packing size per register for DPAS B.
+static const char *const operandLayoutNamePrefix = "layout_operand_";
+static const char *const resultLayoutNamePrefix = "layout_result_";
namespace {
@@ -686,7 +690,8 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
continue;
}
/// For every other user, use a generic attribute name.
- std::string attrName = "op" + std::to_string(operandNumber);
+ std::string attrName =
+ operandLayoutNamePrefix + std::to_string(operandNumber);
owner->setAttr(attrName, layout);
}
}
@@ -746,7 +751,7 @@ static LogicalResult attachLayoutAttributes(
for (auto [i, r] : llvm::enumerate(op->getResults())) {
auto layoutInfo = getLayoutInfoForResult(r);
if (layoutInfo) {
- auto attrName = "r" + std::to_string(i);
+ auto attrName = resultLayoutNamePrefix + std::to_string(i);
op->setAttr(attrName, layoutInfo);
/// Attach the layout attribute to the users of the result.
attachLayoutAttributeToUsers(r, layoutInfo);
@@ -819,16 +824,29 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
return distVecTyOrFailure.value();
}
-static Value reconcileDistribtedVecType(Value orig, VectorType expected,
- PatternRewriter &rewriter) {
+static Value reshapeDistributedVecType(Value orig, VectorType expected,
+ PatternRewriter &rewriter) {
assert(isa<VectorType>(orig.getType()) && "expecting vector type");
auto origVecType = cast<VectorType>(orig.getType());
/// No need to reconcile if the types are the same.
if (origVecType == expected)
return orig;
- auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
- expected, orig);
- return castOp.getResult(0);
+ auto castOp =
+ rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+ return castOp.getResult();
+}
+
+static SmallVector<NamedAttribute>
+filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> newAttrs;
+ for (auto attr : attrs) {
+ if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+ attr.getName().strref().contains(resultLayoutNamePrefix)) {
+ continue;
+ }
+ newAttrs.push_back(attr);
+ }
+ return newAttrs;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -903,11 +921,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
};
/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
-/// still contain the original op that will not be used by the yield op (and
-/// should be cleaned up later with dce). The yield op will bypass the
-/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
-/// is a uniform value accorss all work items within the subgroup.
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op
+/// will still contain the original op that will not be used by the yield op
+/// (and should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because
+/// it is a uniform value across all work items within the subgroup.
///
/// Example:
///
@@ -985,10 +1003,10 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
}
};
-/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`. In
-/// case arguments for the store are passed through the warp op interface they
-/// would be propagated as returned values. Only the source vector for the store
-/// is distributed according to sg_map attribute.
+/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`.
+/// In case arguments for the store are passed through the warp op interface
+/// they would be propagated as returned values. Only the source vector for
+/// the store is distributed according to sg_map attribute.
///
/// Example:
///
@@ -1033,7 +1051,6 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
"Failed to distribute the type");
VectorType distributedTypeByWarpOp =
distributedTypeByWarpOpOrFailure.value();
- llvm::errs() << "distributed type: " << distributedTypeByWarpOp << "\n";
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -1050,21 +1067,21 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
/// For the value operand, there can be a conflict between the vector type
/// distributed by the warp op and (xegpu-specific) distributed type
- /// supported by the store op. We reconcile these mismatches by inserting a
- /// cast. These gets cancelled out later.
+ /// supported by the store op. We reconcile these mismatches by inserting
+ /// a cast. These gets cancelled out later.
auto storeNdDistributedValueTyOrFailure =
storeOp.getTensorDescType().getDistributedVectorType();
if (failed(storeNdDistributedValueTyOrFailure))
return rewriter.notifyMatchFailure(
storeOp, "Failed to get distributed vector type for the store op");
- newStoreOperands.push_back(reconcileDistribtedVecType(
+ newStoreOperands.push_back(reshapeDistributedVecType(
newWarpOp.getResult(newRetIndices[0]),
storeNdDistributedValueTyOrFailure.value(), rewriter));
newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
- rewriter.create<xegpu::StoreNdOp>(newWarpOp.getLoc(), TypeRange{},
- newStoreOperands);
- storeOp->setDialectAttrs(storeOp->getDialectAttrs());
+ rewriter.create<xegpu::StoreNdOp>(
+ newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+ filterTemporaryLayoutAttributes(storeOp->getAttrs()));
rewriter.eraseOp(storeOp);
return success();
}
@@ -1074,8 +1091,9 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later with dce). The yield op will
-/// bypass the load's arguments. Only the loaded vector is distributed according
-/// to sg_map attribute and, tensor descriptor types is not distributed.
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the sg_map attribute; the tensor descriptor type is not
+/// distributed.
///
/// Example:
///
@@ -1122,7 +1140,8 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, subgroupOp, /* new yielded values = */ loadOp.getTensorDesc(),
+ rewriter, subgroupOp,
+ /* new yielded values = */ loadOp.getTensorDesc(),
/* new yielded types = */ tensorDescTy, newRetIndices);
/// Create a new load op outside the warp op with the distributed vector
@@ -1135,13 +1154,14 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
loadOp, "Failed to get distributed vector type for the load op");
Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
- newWarpOp->getResult(newRetIndices[0]), loadOp->getAttrs());
+ newWarpOp->getResult(newRetIndices[0]),
+ filterTemporaryLayoutAttributes(loadOp->getAttrs()));
Value distributedVal = newWarpOp.getResult(operandIdx);
- /// There can be a conflict between the vector type distributed by the warp
- /// op and (xegpu-specific) distributed type supported by the load op. We
- /// reconcile these mismatches by inserting a cast.
- newLoadOp = reconcileDistribtedVecType(newLoadOp, distributedTypeByWarpOp,
- rewriter);
+ /// There can be a conflict between the vector type distributed by the
+ /// warp op and (xegpu-specific) distributed type supported by the load
+ /// op. We reconcile these mismatches by inserting a cast.
+ newLoadOp =
+ reshapeDistributedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
return success();
}
@@ -1161,8 +1181,9 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
unsigned operandIdx = operand->getOperandNumber();
xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+ auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
xegpu::LayoutAttr layoutOut =
- dpasOp->getAttrOfType<xegpu::LayoutAttr>("r0");
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
@@ -1211,7 +1232,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
}
for (auto i : newRetIndices) {
- newDpasOperands.push_back(reconcileDistribtedVecType(
+ newDpasOperands.push_back(reshapeDistributedVecType(
newWarpOp.getResult(i),
newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
}
@@ -1220,7 +1241,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
newDpasOperands, dpasOp->getAttrs());
Value disributedVal = newWarpOp.getResult(operandIdx);
/// Reconile the output type.
- disributedVal = reconcileDistribtedVecType(
+ disributedVal = reshapeDistributedVecType(
disributedVal,
getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
new file mode 100644
index 0000000000000..6369eb7dd035e
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -0,0 +1,70 @@
+gpu.module @test {
+gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
+ %c0 = arith.constant 0 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+ %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+ %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
>From 2f2ec101b06f5f38459eea46454b93d6e47c1278 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 14 Apr 2025 17:23:00 +0000
Subject: [PATCH 44/53] fix issues
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 +-
.../Transforms/XeGPUSubgroupDistribute.cpp | 129 ++++++++++++------
.../Dialect/XeGPU/subgroup-distribution.mlir | 108 +++++++--------
3 files changed, 151 insertions(+), 98 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index f8e04f9b3aef7..1dafc9936107e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -676,12 +676,12 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//
LogicalResult DpasOp::verify() {
- // int64_t lhsRank = getLhsType().getRank();
- // int64_t rhsRank = getRhsType().getRank();
- // int64_t resRank = getResultType().getRank();
- // auto lhsShape = getLhsType().getShape();
- // auto rhsShape = getRhsType().getShape();
- // auto resShape = getResultType().getShape();
+ int64_t lhsRank = getLhsType().getRank();
+ int64_t rhsRank = getRhsType().getRank();
+ int64_t resRank = getResultType().getRank();
+ auto lhsShape = getLhsType().getShape();
+ auto rhsShape = getRhsType().getShape();
+ auto resShape = getResultType().getShape();
if (getAcc()) {
if (getAcc().getType() != getResultType())
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 34e0ac7b2d094..f64f0b5235705 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -21,6 +21,7 @@
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
@@ -679,17 +680,7 @@ void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
- /// If the user is a DpasOp, set A, B or C layout attributes.
- if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
- if (operandNumber == 0)
- dpasOp.setALayoutAttr(layout);
- else if (operandNumber == 1)
- dpasOp.setBLayoutAttr(layout);
- else if (operandNumber == 2)
- dpasOp.setCLayoutAttr(layout);
- continue;
- }
- /// For every other user, use a generic attribute name.
+ /// Use a generic name for ease of querying the layout attribute later.
std::string attrName =
operandLayoutNamePrefix + std::to_string(operandNumber);
owner->setAttr(attrName, layout);
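With the generic naming scheme, a consumer such as dpas simply carries discardable attributes keyed by operand and result index (a sketch; the layout values reuse the syntax from the tests in this series, and the SIMD-level shapes are those of test_dpas_vc):

  %d = xegpu.dpas %a, %b {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
                          layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                          layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
    : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>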
@@ -824,18 +815,66 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
return distVecTyOrFailure.value();
}
-static Value reshapeDistributedVecType(Value orig, VectorType expected,
- PatternRewriter &rewriter) {
- assert(isa<VectorType>(orig.getType()) && "expecting vector type");
- auto origVecType = cast<VectorType>(orig.getType());
- /// No need to reconcile if the types are the same.
- if (origVecType == expected)
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+ return xegpu::TensorDescType::get(
+ tensorDesc.getContext(), tensorDesc.getShape(),
+ tensorDesc.getElementType(), tensorDesc.getEncoding(),
+ xegpu::LayoutAttr());
+}
+
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+ PatternRewriter &rewriter) {
+ /// If orig and expected types are the same, return orig.
+ if (orig.getType() == expected)
return orig;
- auto castOp =
- rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
- return castOp.getResult();
+ /// If orig is a vector type, create a shape cast op to reconcile the types.
+ if (isa<VectorType>(orig.getType())) {
+ auto castOp =
+ rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+ return castOp.getResult();
+ }
+ /// If orig is a tensor descriptor type, create an unrealized conversion cast
+ /// op to reconcile the types.
+ if (isa<xegpu::TensorDescType>(orig.getType())) {
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
+ }
+ llvm_unreachable("Unsupported type for reconciliation");
+ return orig;
}
+// static Value reconcileDistributedTensorDescTy(Value orig,
+// xegpu::TensorDescType expected,
+// PatternRewriter &rewriter) {
+// assert(isa<xegpu::TensorDescType>(orig.getType()) &&
+// "expecting tensor descriptor type");
+// auto origTensorDescTy = cast<xegpu::TensorDescType>(orig.getType());
+// /// No need to reconcile if the types are the same.
+// if (origTensorDescTy == expected)
+// return orig;
+// auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+// expected, orig);
+// return castOp.getResult(0);
+// }
+
+// // unify above 2 functions with a template
+// template <typename T>
+// static Value reconcileDistributedType(Value orig, T expected,
+// PatternRewriter &rewriter) {
+// if constexpr (std::is_same_v<T, VectorType>) {
+// return reconcileDistributedVecType(orig, expected, rewriter);
+// } else if constexpr (std::is_same_v<T, xegpu::TensorDescType>) {
+// return reconcileDistributedTensorDescTy(orig, expected, rewriter);
+// } else {
+// static_assert(llvm::is_one_of<T, VectorType,
+// xegpu::TensorDescType>::value,
+// "Unsupported type for reconciliation");
+// }
+// return orig;
+// }
+
static SmallVector<NamedAttribute>
filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
SmallVector<NamedAttribute> newAttrs;
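The two reconciliation cases in resolveDistributedTy above lower to ordinary casts (illustrative only; the shapes and layout are hypothetical):

  // Vector case: warp-distributed and xegpu-distributed types differ only in shape.
  %v = vector.shape_cast %warp_result : vector<16x1xf16> to vector<16xf16>
  // Tensor descriptor case: the layout attribute is dropped after distribution.
  %t = builtin.unrealized_conversion_cast %desc
    : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16>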
@@ -951,7 +990,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
/// -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
-struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const override {
@@ -993,8 +1032,11 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
newDescOperands.push_back(newWarpOp.getResult(i));
}
rewriter.setInsertionPointAfter(newWarpOp);
+ auto distributedTensorDescTy =
+ dropLayouts(descOp.getType()); /// Distributed tensor descriptor type
+ /// does not contain layout info.
auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
- newWarpOp.getLoc(), descOp.getType(), newDescOperands,
+ newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
descOp->getAttrs());
Value distributedVal = newWarpOp.getResult(operandIdx);
@@ -1027,7 +1069,7 @@ struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
-struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const override {
@@ -1065,19 +1107,24 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newStoreOperands;
- /// For the value operand, there can be a conflict between the vector type
+ /// For the value operand, there can be a mismatch between the vector type
/// distributed by the warp op and (xegpu-specific) distributed type
- /// supported by the store op. We reconcile these mismatches by inserting
- /// a cast. These gets cancelled out later.
+ /// supported by the store op. Type mismatches must be resolved using an
+ /// appropriate cast op.
auto storeNdDistributedValueTyOrFailure =
storeOp.getTensorDescType().getDistributedVectorType();
if (failed(storeNdDistributedValueTyOrFailure))
return rewriter.notifyMatchFailure(
storeOp, "Failed to get distributed vector type for the store op");
- newStoreOperands.push_back(reshapeDistributedVecType(
+ newStoreOperands.push_back(resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]),
storeNdDistributedValueTyOrFailure.value(), rewriter));
- newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[1]));
+ /// For the tensor descriptor operand, the layout attribute is dropped after
+ /// distribution. Types need to be resolved in this case as well.
+ auto distributedTensorDescTy = dropLayouts(storeOp.getTensorDescType());
+ newStoreOperands.push_back(
+ resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+ distributedTensorDescTy, rewriter));
rewriter.create<xegpu::StoreNdOp>(
newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
@@ -1117,7 +1164,7 @@ struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
/// %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
///
/// ```
-struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const override {
@@ -1161,13 +1208,13 @@ struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
/// warp op and (xegpu-specific) distributed type supported by the load
/// op. We reconcile these mismatches by inserting a cast.
newLoadOp =
- reshapeDistributedVecType(newLoadOp, distributedTypeByWarpOp, rewriter);
+ resolveDistributedTy(newLoadOp, distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
return success();
}
};
-struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
+struct DpasDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
PatternRewriter &rewriter) const override {
@@ -1179,15 +1226,21 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
unsigned operandIdx = operand->getOperandNumber();
- xegpu::LayoutAttr layoutA = dpasOp.getALayoutAttr();
- xegpu::LayoutAttr layoutB = dpasOp.getBLayoutAttr();
+ auto layoutAName =
+ llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str();
+ auto layoutBName =
+ llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str();
auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
+ xegpu::LayoutAttr layoutA =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+ xegpu::LayoutAttr layoutB =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
xegpu::LayoutAttr layoutOut =
dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
if (!layoutA || !layoutB || !layoutOut)
return rewriter.notifyMatchFailure(
dpasOp,
- "the xegpu::Dpas op lacks sg_map attribute for A, B or output");
+ "the xegpu::Dpas op lacks layout attribute for A, B or output");
auto distLhsTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
@@ -1232,7 +1285,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
}
for (auto i : newRetIndices) {
- newDpasOperands.push_back(reshapeDistributedVecType(
+ newDpasOperands.push_back(resolveDistributedTy(
newWarpOp.getResult(i),
newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
}
@@ -1241,7 +1294,7 @@ struct SubgroupOpDpas final : public gpu::WarpDistributionPattern {
newDpasOperands, dpasOp->getAttrs());
Value disributedVal = newWarpOp.getResult(operandIdx);
/// Reconile the output type.
- disributedVal = reshapeDistributedVecType(
+ disributedVal = resolveDistributedTy(
disributedVal,
getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
@@ -1266,8 +1319,8 @@ struct XeGPUSubgroupDistributePass final
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
- patterns.add<SubgroupOpTensorDescOp, SubgroupOpStoreNd, SubgroupOpLoadNd,
- SubgroupOpDpas>(patterns.getContext());
+ patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+ LoadNdDistribution, DpasDistribution>(patterns.getContext());
}
void XeGPUSubgroupDistributePass::runOnOperation() {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 6369eb7dd035e..7197ddfb286eb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -8,63 +8,63 @@ gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
}
}
-// -----
-gpu.module @test {
-gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
- %c0 = arith.constant 0 : index
- %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
- gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+// %c0 = arith.constant 0 : index
+// %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+// gpu.return
+// }
+// }
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
- %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
- xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
- gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+// %c0 = arith.constant 0 : index
+// %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+// %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+// gpu.return
+// }
+// }
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
- gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+// %c0 = arith.constant 0 : index
+// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+// %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+// gpu.return
+// }
+// }
-// -----
-gpu.module @test {
-gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
- %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
- %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
- %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
- gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+// %c0 = arith.constant 0 : index
+// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+// %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+// %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+// gpu.return
+// }
+// }
-// -----
-gpu.module @test {
-gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
- %c0 = arith.constant 0 : index
- %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-}
+// // -----
+// gpu.module @test {
+// gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+// %c0 = arith.constant 0 : index
+// %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+// gpu.return
+// }
+// }
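[Editorial note on the hunk above, not part of the patch] The DpasDistribution change switches from dedicated A/B layout accessors to looking up temporary per-operand attributes whose names are built from operandLayoutNamePrefix / resultLayoutNamePrefix plus an index, and then reconciles types with resolveDistributedTy. A minimal standalone sketch of that naming scheme and of the per-lane shape arithmetic implied by a lane_layout of [1, 16] follows; the concrete prefix strings and the divide-by-lane-layout rule are assumptions inferred from this diff and from the later test CHECK lines (vector<8x16xf16> becoming vector<8x1xf16> per lane), not a copy of the pass.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical prefix values; the real constants live in
// XeGPUSubgroupDistribute.cpp and are not shown in this hunk.
static const std::string kOperandLayoutPrefix = "layout_operand_";

// Name of the temporary layout attribute attached for operand `idx`,
// mirroring the llvm::formatv("{0}{1}", prefix, idx) pattern in the diff.
static std::string operandLayoutName(unsigned idx) {
  return kOperandLayoutPrefix + std::to_string(idx);
}

// Per-lane ("distributed") shape: divide each dimension by the lane layout.
static std::vector<int64_t> distributeShape(const std::vector<int64_t> &shape,
                                            const std::vector<int64_t> &laneLayout) {
  std::vector<int64_t> out;
  for (size_t i = 0; i < shape.size(); ++i)
    out.push_back(shape[i] / laneLayout[i]);
  return out;
}

int main() {
  // DPAS A operand: vector<8x16xf16> with lane_layout [1, 16] distributes to
  // vector<8x1xf16> per lane, which the pass later shape-casts to vector<8xf16>.
  std::vector<int64_t> a = distributeShape({8, 16}, {1, 16});
  std::printf("%s -> %lld x %lld per lane\n", operandLayoutName(0).c_str(),
              (long long)a[0], (long long)a[1]);
  return 0;
}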
>From 2ae3543e7a56f8fb37f5ee86c23b980b451e1aac Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 14 Apr 2025 21:24:31 +0000
Subject: [PATCH 45/53] fix issues
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 49 ++++----
.../Dialect/XeGPU/subgroup-distribution.mlir | 108 +++++++++---------
2 files changed, 83 insertions(+), 74 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f64f0b5235705..05d15a7c71e58 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -717,6 +717,11 @@ static LogicalResult attachLayoutAttributes(
/// If no results, move on.
if (op->getNumResults() == 0)
return WalkResult::advance();
+ /// If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return WalkResult::advance();
+
if (auto tensorDescTy =
dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
@@ -738,7 +743,7 @@ static LogicalResult attachLayoutAttributes(
op->erase();
return WalkResult::advance();
}
- /// Otherwise simply attach the sg_map to the op itself.
+ /// Otherwise simply attach the layout to the op itself.
for (auto [i, r] : llvm::enumerate(op->getResults())) {
auto layoutInfo = getLayoutInfoForResult(r);
if (layoutInfo) {
@@ -1199,14 +1204,19 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
if (failed(loadNdDistValueTyOrFailure))
return rewriter.notifyMatchFailure(
loadOp, "Failed to get distributed vector type for the load op");
+ auto distributedTensorDescTy =
+ dropLayouts(loadOp.getTensorDescType()); /// Distributed tensor
+ /// descriptor type does not
+ /// contain layout info.
Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
- newWarpOp->getResult(newRetIndices[0]),
+ resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+ distributedTensorDescTy, rewriter),
filterTemporaryLayoutAttributes(loadOp->getAttrs()));
Value distributedVal = newWarpOp.getResult(operandIdx);
/// There can be a conflict between the vector type distributed by the
/// warp op and (xegpu-specific) distributed type supported by the load
- /// op. We reconcile these mismatches by inserting a cast.
+ /// op. Resolve these mismatches by inserting a cast.
newLoadOp =
resolveDistributedTy(newLoadOp, distributedTypeByWarpOp, rewriter);
rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
@@ -1274,29 +1284,28 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newDpasOperands;
SmallVector<VectorType> newDpasOperandExpectedTypes;
- /// Reconcile the distributed types with the original types.
+ /// Resolve the distributed types with the original types.
newDpasOperandExpectedTypes.push_back(
getDistributedVectorType(layoutA, dpasOp.getLhsType()));
newDpasOperandExpectedTypes.push_back(
getDistributedVectorType(layoutB, dpasOp.getRhsType()));
- if (dpasOp.getAcc()) {
- newDpasOperandExpectedTypes.push_back(
- getDistributedVectorType(layoutOut, dpasOp.getResultType()));
- }
-
- for (auto i : newRetIndices) {
- newDpasOperands.push_back(resolveDistributedTy(
- newWarpOp.getResult(i),
- newDpasOperandExpectedTypes[newDpasOperands.size()], rewriter));
+ auto distributedResultTy =
+ getDistributedVectorType(layoutOut, dpasOp.getResultType());
+ if (dpasOp.getAcc())
+ newDpasOperandExpectedTypes.push_back(distributedResultTy);
+
+ for (unsigned i = 0; i < newRetIndices.size(); i++) {
+ newDpasOperands.push_back(
+ resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
+ newDpasOperandExpectedTypes[i], rewriter));
}
- auto newDpasOp = rewriter.create<xegpu::DpasOp>(
- newWarpOp->getLoc(), distResultTypeByWarpOpOrFailure.value(),
- newDpasOperands, dpasOp->getAttrs());
+ Value newDpasOp = rewriter.create<xegpu::DpasOp>(
+ newWarpOp->getLoc(), distributedResultTy, newDpasOperands,
+ filterTemporaryLayoutAttributes(dpasOp->getAttrs()));
Value disributedVal = newWarpOp.getResult(operandIdx);
- /// Reconile the output type.
- disributedVal = resolveDistributedTy(
- disributedVal,
- getDistributedVectorType(layoutOut, dpasOp.getResultType()), rewriter);
+ /// Resolve the output type.
+ newDpasOp = resolveDistributedTy(
+ newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter);
rewriter.replaceAllUsesWith(disributedVal, newDpasOp);
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 7197ddfb286eb..6369eb7dd035e 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -8,63 +8,63 @@ gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
}
}
-// // -----
-// gpu.module @test {
-// gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
-// %c0 = arith.constant 0 : index
-// %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-// gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
-// %c0 = arith.constant 0 : index
-// %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
-// %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-// gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-// %c0 = arith.constant 0 : index
-// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-// gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
-// // -----
-// gpu.module @test {
-// gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
-// %c0 = arith.constant 0 : index
-// %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-// %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
-// %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
-// %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
-// gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+ %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+ %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
-// // -----
-// gpu.module @test {
-// gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
-// %c0 = arith.constant 0 : index
-// %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-// gpu.return
-// }
-// }
+// -----
+gpu.module @test {
+gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
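[Editorial note, not part of the patch] The LoadNd/Dpas changes in this patch repeatedly resolve a mismatch between the n-D fragment the warp op yields per lane (e.g. vector<16x1xf16>) and the 1-D fragment the XeGPU ops expect (e.g. vector<16xf16>). A tiny standalone sketch of the invariant behind that resolution, assuming the two types only need to agree on element count for a shape cast to be legal:

#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Element count of a vector shape.
static int64_t numElements(const std::vector<int64_t> &shape) {
  return std::accumulate(shape.begin(), shape.end(), int64_t{1},
                         std::multiplies<int64_t>());
}

// The warp-distributed type and the XeGPU-native distributed type are
// reconcilable by a cast iff they carry the same number of elements.
static bool reconcilableByShapeCast(const std::vector<int64_t> &warpShape,
                                    const std::vector<int64_t> &xegpuShape) {
  return numElements(warpShape) == numElements(xegpuShape);
}

int main() {
  // Mirrors the test CHECK lines: vector<16x1xf16> <-> vector<16xf16>.
  std::printf("%d\n", reconcilableByShapeCast({16, 1}, {16}) ? 1 : 0);
  return 0;
}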
>From 4c63916e791f4bca4ee7ac91d7d515a10f6bdea1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 14 Apr 2025 22:21:41 +0000
Subject: [PATCH 46/53] fix issues
---
.../Dialect/XeGPU/subgroup-distribution.mlir | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 6369eb7dd035e..1b5b274eec9e3 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -1,3 +1,11 @@
+// RUN: mlir-opt -xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @test_store_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+// CHECK: gpu.return
gpu.module @test {
gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
%c0 = arith.constant 0 : index
@@ -9,6 +17,12 @@ gpu.func @test_store_nd_1d(%arg0: memref<16xf32>){
}
// -----
+// CHECK-LABEL: gpu.func @test_store_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x1xf16>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = vector.shape_cast %[[CST]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.module @test {
gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
%c0 = arith.constant 0 : index
@@ -22,6 +36,12 @@ gpu.func @test_store_nd_2d(%arg0: memref<16x16xf16>){
// -----
+// CHECK-LABEL: gpu.func @test_load_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
gpu.module @test {
gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
%c0 = arith.constant 0 : index
@@ -34,6 +54,12 @@ gpu.func @test_load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
}
// -----
+// CHECK-LABEL: gpu.func @test_load_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.module @test {
gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
%c0 = arith.constant 0 : index
@@ -46,6 +72,15 @@ gpu.func @test_load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
}
// -----
+// CHECK-LABEL: gpu.func @test_load_nd_array_length
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
+// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
+// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.module @test {
gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
%c0 = arith.constant 0 : index
@@ -59,6 +94,19 @@ gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x1
}
// -----
+// CHECK-LABEL: gpu.func @test_dpas
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %1:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
+// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>):
+// CHECK: gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
+// CHECK: }
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16>
+// CHECK: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16>
+// CHECK: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T5]], %[[T6]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.module @test {
gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
%c0 = arith.constant 0 : index
@@ -68,3 +116,27 @@ gpu.func @test_dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: ve
gpu.return
}
}
+
+// -----
+// CHECK-LABEL: gpu.func @load_dpas_store
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
>From 2d9cfa3ffd40749198b5fd666f873f3a12ac57a1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 15 Apr 2025 18:40:17 +0000
Subject: [PATCH 47/53] fix build issue
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 05d15a7c71e58..edfc143a6fb42 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -600,6 +600,8 @@ namespace {
/// Driver class for running the LayoutInfoPropagation analysis.
class RunLayoutInfoPropagation {
public:
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
+
RunLayoutInfoPropagation(Operation *op) : target(op) {
SymbolTableCollection symbolTable;
solver.load<DeadCodeAnalysis>();
>From 775d039bb7a5ba9fd91939411e2d69312879f1e0 Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 15 Apr 2025 18:46:55 +0000
Subject: [PATCH 48/53] refine verifier for gather/scatter
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 62 +++++++++-----------------
mlir/test/Dialect/XeGPU/invalid.mlir | 4 +-
2 files changed, 22 insertions(+), 44 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 1dafc9936107e..f5205c5e7e5bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -547,30 +547,18 @@ LogicalResult LoadGatherOp::verify() {
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
auto chunkSize = tdescTy.getChunkSize();
- // for SIMT code, the value should be 1D vector with size of chunkSize.
- if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
- if (valueTy.getNumElements() != chunkSize) {
+
+ // a valid shape for SIMT case
+ if (valueTy.getRank() == 1 && valueTy.getNumElements() == chunkSize) {
+ if (tdescTy.getLayoutAttr())
return emitOpError()
- << "Result shape " << makeString(valueShape)
- << " is not a valid distribution for tensor descriptor "
- << tdescTy;
- } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
- if (tdescTy.getLayoutAttr())
- return emitOpError()
- << "TensorDesc doesn't need LayoutAttr for SIMT code";
- if (getTransposeAttr())
- return emitOpError() << "doesn't need TransposeAttr for SIMT code";
- }
- return success();
- } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
- // for 1D vector and valueTy.getNumElements() == tdescShape[0] case,
- // it is a valid SIMT code if chunkSize happens to be the same as
- // subgroup size, e.g., tensor_desc<16x16xf16, chunkSize = 16>
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+ if (getTransposeAttr())
+ return emitOpError() << "doesn't need TransposeAttr for SIMT code";
return success();
}
- // For SIMD code verification.
- if (tdescTy.getRank() == 2) {
+ if (tdescTy.getRank() == 2 && valueTy.getRank() == 2) {
if (!getTransposeAttr())
return emitOpError("load of rank-2 tensor has to be transposed.");
transpose({1, 0}, tdescShape);
@@ -578,7 +566,8 @@ LogicalResult LoadGatherOp::verify() {
if (tdescShape != valueShape)
return emitOpError() << "Result shape " << makeString(valueShape)
- << " is not consistent with tensor descriptor "
+ << " is neither a valid distribution for SIMT nor "
+ "consistent with the tensor descriptor for SIMD "
<< tdescTy;
return success();
}
@@ -613,30 +602,18 @@ LogicalResult StoreScatterOp::verify() {
return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
auto chunkSize = tdescTy.getChunkSize();
- // for SIMT code, the value should be 1D vector with size of chunkSize.
- if (valueTy.getRank() == 1 && valueTy.getNumElements() != tdescShape[0]) {
- if (valueTy.getNumElements() != chunkSize) {
+
+ // a valid shape for SIMT case
+ if (valueTy.getRank() == 1 && valueTy.getNumElements() == chunkSize) {
+ if (tdescTy.getLayoutAttr())
return emitOpError()
- << "Value shape " << makeString(valueShape)
- << " is not a valid distribution for tensor descriptor "
- << tdescTy;
- } else { // valid SIMT code doesn't need LayoutAttr and TransposeAttr.
- if (tdescTy.getLayoutAttr())
- return emitOpError()
- << "TensorDesc doesn't need LayoutAttr for SIMT code";
- if (getTransposeAttr())
- return emitOpError() << "doesn't need TransposeAttr for SIMT code";
- }
- return success();
- } else if (valueTy.getRank() == 1 && tdescShape[0] == chunkSize) {
- // for 1D vector and valueTy.getNumElements() == tdescShape[0] case,
- // it is a valid SIMT code if chunkSize happens to be the same as
- // subgroup size, e.g., tensor_desc<16x16xf16, chunkSize = 16>
+ << "TensorDesc doesn't need LayoutAttr for SIMT code";
+ if (getTransposeAttr())
+ return emitOpError() << "doesn't need TransposeAttr for SIMT code";
return success();
}
- // for SIMD code verification.
- if (tdescTy.getRank() == 2) {
+ if (tdescTy.getRank() == 2 && valueTy.getRank() == 2) {
if (!getTransposeAttr())
return emitOpError("Store of a rank-2 tensor has to be transposed.");
transpose({1, 0}, tdescShape);
@@ -644,7 +621,8 @@ LogicalResult StoreScatterOp::verify() {
if (tdescShape != valueShape)
return emitOpError() << "Value shape " << makeString(valueShape)
- << " is not consistent with tensor descriptor "
+ << " is neither a valid distribution for SIMT nor "
+ "consistent with the tensor descriptor for SIMD "
<< tdescTy;
return success();
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index a02427b6e317b..2a7436807f5f4 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -255,7 +255,7 @@ func.func @test_load_gather_simt_1(%src: ui64) {
%0 = arith.constant dense<1>: vector<4xi1>
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Result shape [6] is not a valid distribution for tensor descriptor}}
+ // expected-error at +1 {{Result shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
%2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1> -> vector<6xf32>
return
}
@@ -266,7 +266,7 @@ func.func @test_store_scatter_simt_1(%src: ui64) {
%cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
%val = arith.constant dense<2.9>: vector<6xf32>
%1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
- // expected-error at +1 {{Value shape [6] is not a valid distribution for tensor descriptor}}
+ // expected-error at +1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}}
xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint<cached>}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>, vector<4xi1>
return
}
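[Editorial note, not part of the patch] The refined gather/scatter verifier above accepts a value either as SIMT (a 1-D vector whose length equals the chunk size, with no LayoutAttr or TransposeAttr) or as SIMD (shape matching the tensor descriptor, transposed for rank-2 descriptors). A minimal standalone sketch of that shape classification, with the attribute checks elided and treated as an assumption of this illustration:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

enum class Verdict { SimtOk, SimdOk, Invalid };

static Verdict classify(std::vector<int64_t> tdescShape,
                        std::vector<int64_t> valueShape, int64_t chunkSize,
                        bool hasTranspose) {
  auto numElems = [](const std::vector<int64_t> &s) {
    int64_t n = 1;
    for (int64_t d : s)
      n *= d;
    return n;
  };
  // SIMT case: a 1-D value whose length equals the chunk size.
  if (valueShape.size() == 1 && numElems(valueShape) == chunkSize)
    return Verdict::SimtOk;
  // SIMD case: a rank-2 tensor_desc must be accessed transposed.
  if (tdescShape.size() == 2 && valueShape.size() == 2) {
    if (!hasTranspose)
      return Verdict::Invalid;
    std::swap(tdescShape[0], tdescShape[1]);
  }
  return tdescShape == valueShape ? Verdict::SimdOk : Verdict::Invalid;
}

int main() {
  // Mirrors the invalid.mlir case: tensor_desc<4x2xf32, chunk_size = 2> with a
  // vector<6xf32> value is neither a valid SIMT nor a valid SIMD shape.
  std::printf("%d\n", (int)classify({4, 2}, {6}, /*chunkSize=*/2,
                                    /*hasTranspose=*/false));
  return 0;
}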
>From 5520ce18138b5153d7ecb874fe10be78127d719e Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen at intel.com>
Date: Tue, 15 Apr 2025 18:59:39 +0000
Subject: [PATCH 49/53] update comments
---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 13 +++++++------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 1 -
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 269e445c3790c..b865b80f0075e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -320,21 +320,22 @@ LogicalResult TensorDescType::verify(
// ---------------------------------------------------------------------
// Case 1: Regular loads/stores.
// ---------------------------------------------------------------------
-// Distributed vector shape must be:
-// [chunk_size / lane_data_size, lane_data_size]
-// If the tensor descriptor shape is 1D, first dimension is ignored (set to 1).
-// [lane_data_size]
+// The following conditions must be met:
+// * tensor_desc[0] == lane_layout[0]
+// Distributed vector is a 1D vector with shape:
+// [chunk_size]
// ---------------------------------------------------------------------
// Case 2: Block loads/stores
// ---------------------------------------------------------------------
// Additional definitions:
// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
// n_distribution_units = tensor_size / distribution_unit_size
+// fragment_size = n_distribution_units * lane_data_size
// Given above definitions, the following conditions must be met:
// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
-// Distributed vector shape must be:
-// [n_distribution_units, lane_data_size]
+// Distributed vector is a 1D vector with shape:
+// [fragment_size]
FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
// It only works for subgroup level layout, which only has lane_layout
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index f5205c5e7e5bc..4305c0431cc7e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -295,7 +295,6 @@ LogicalResult LoadNdOp::verify() {
}
// Check SIMD mode.
- // adjusted tensor descriptor shape tracks the expected shape of the result.
auto tdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
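[Editorial note, not part of the patch] The updated Case-2 comment above defines tensor_size, n_distribution_units and fragment_size and states that the distributed vector is 1-D of shape [fragment_size]. A worked instance follows, under the assumption that distribution_unit_size = subgroup_size * lane_data_size (that definition is not part of this hunk); with tensor_desc<16x16xf16, array_length = 2>, lane_layout [1, 16] and lane_data [1, 1] it yields 32, matching the vector<32xf16> seen in the test_load_nd_array_length CHECK lines earlier in this series.

#include <cstdint>
#include <cstdio>
#include <vector>

// fragment_size per the Case-2 formulas, with the assumed
// distribution_unit_size = subgroupSize * laneDataSize.
static int64_t fragmentSize(const std::vector<int64_t> &tdescShape,
                            int64_t arrayLength, int64_t subgroupSize,
                            int64_t laneDataSize) {
  int64_t tensorSize = arrayLength;
  for (int64_t d : tdescShape)
    tensorSize *= d;
  int64_t distributionUnitSize = subgroupSize * laneDataSize;
  int64_t nDistributionUnits = tensorSize / distributionUnitSize;
  return nDistributionUnits * laneDataSize;
}

int main() {
  // 16 * 16 * 2 / (16 * 1) * 1 = 32 elements per lane.
  std::printf("%lld\n", (long long)fragmentSize({16, 16}, 2, 16, 1));
  return 0;
}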
>From 6abc12a97955425f207f576b827911a9f3cb0fdc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 15 Apr 2025 20:34:56 +0000
Subject: [PATCH 50/53] fix tests
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 2 +-
.../Dialect/XeGPU/subgroup-distribution.mlir | 2 +-
.../XeGPU/subgroup-map-propagation.mlir | 320 +++++++++---------
3 files changed, 162 insertions(+), 162 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index edfc143a6fb42..9bbe0d753d92d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -678,7 +678,7 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
-void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
+static void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
index 1b5b274eec9e3..fd1be62080e57 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -96,7 +96,7 @@ gpu.func @test_load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x1
// -----
// CHECK-LABEL: gpu.func @test_dpas
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %1:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
+// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>):
// CHECK: gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index 1ae4348af33e6..a5468681e68dc 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -2,27 +2,27 @@
// CHECK: function: test_dpas_f16:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -40,17 +40,17 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
// -----
// CHECK: function: test_dpas_i8:
// CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
@@ -62,27 +62,27 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
// -----
// CHECK: function: test_load_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -99,29 +99,29 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
// -----
// CHECK: function: test_vector_transpose:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -139,19 +139,19 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
// -----
// CHECK: function: test_extf_truncf:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -164,29 +164,29 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
// -----
// CHECK: function: test_load_gather_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -204,17 +204,17 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
// -----
// CHECK: function: test_load_gather_1d:
// CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -227,15 +227,15 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
// -----
// CHECK: function: test_store_scatter_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
%cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -248,15 +248,15 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
// -----
// CHECK: function: test_store_scatter_1d:
// CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -268,27 +268,27 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
// -----
// CHECK: function: test_vector_bitcast_i16_to_i8:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
@@ -305,29 +305,29 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
// -----
// CHECK: function: test_vector_bitcast_i8_to_f16:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
@@ -345,21 +345,21 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
// -----
// CHECK: function: test_binary_op_one_use:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -373,23 +373,23 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
// -----
// CHECK: function: test_binary_op_multiple_uses:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -404,39 +404,39 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
// -----
// CHECK: function: test_for_op:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 128 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 16 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.for
-// CHECK-NEXT: sg_map for result #0: Not assigned.
-// CHECK-NEXT: sg_map for result #1: Not assigned.
-// CHECK-NEXT: sg_map for result #2: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: Not assigned.
+// CHECK-NEXT: layout for result #1: Not assigned.
+// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -460,23 +460,23 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
// -----
// CHECK: function: test_if_single_use:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -494,25 +494,25 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
// -----
// CHECK: function: test_if_multiple_uses:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 4
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -531,13 +531,13 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
// -----
// CHECK: function: test_vector_outer_reduction:
// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
@@ -548,13 +548,13 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
// -----
// CHECK: function: test_vector_inner_reduction:
// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
>From 379e18671ad3e5dec0efdf85784d1a6799591bf0 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 16 Apr 2025 16:58:34 +0000
Subject: [PATCH 51/53] fix
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 44 ++++++++++---------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9bbe0d753d92d..532509fcc0910 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -107,10 +107,11 @@ using LaneData = Layout;
/// LayoutInfo
///===----------------------------------------------------------------------===///
-/// Helper class for tracking the analysis state of a value. For SGPropagation,
-/// the analysis state is simply the lane_layout and lane_data of each value.
-/// Purpose of this analysis to propagate some unique layout for each value in
-/// the program starting from some known values (like DPAS, StoreNd, etc.).
+/// Helper class for tracking the analysis state of a value. For layout
+/// propagation, the analysis state is simply the lane_layout and lane_data of
+/// each value. The purpose of this analysis is to propagate some unique layout for
+/// each value in the program starting from a set of anchor operations (like
+/// DPAS, StoreNd, etc.).
///
/// Given this, LayoutInfo satisifies the following properties:
/// 1) LayoutInfo is a lattice with two states - assigned and not assigned.
@@ -173,7 +174,7 @@ LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
/// Since this is a backward analysis, join method is not used.
LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
- llvm_unreachable("Join should not be triggered by SGMapPropagation.");
+ llvm_unreachable("Join should not be triggered by layout propagation.");
}
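For illustration, a minimal standalone C++ sketch of the two-state lattice described above. The simplified payload and the meet rule shown here are assumptions for illustration only; the patch's actual LayoutInfo and its meet are defined in the surrounding code and are not reproduced here.

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for the lane_layout / lane_data payload.
struct SimpleLayout {
  std::vector<int64_t> laneLayout;
  std::vector<int64_t> laneData;
};

// Two-state lattice: a value either has no layout yet ("not assigned") or
// carries an assigned layout.
struct SimpleLayoutInfo {
  bool assigned = false;
  SimpleLayout layout;

  // One plausible meet for such a lattice: keep an existing assignment,
  // otherwise adopt the other side. Join is never needed for a backward
  // analysis, matching the llvm_unreachable above.
  static SimpleLayoutInfo meet(const SimpleLayoutInfo &lhs,
                               const SimpleLayoutInfo &rhs) {
    return lhs.assigned ? lhs : rhs;
  }
};

int main() {
  SimpleLayoutInfo unassigned;
  SimpleLayoutInfo assigned{true, {{1, 16}, {1, 1}}};
  // Meeting an unassigned state with an assigned one yields the assigned one.
  assert(SimpleLayoutInfo::meet(unassigned, assigned).assigned);
  return 0;
}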
/// Get the transposed layout according to the given permutation.
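As a standalone sketch of the permutation arithmetic this refers to (plain C++, not the MLIR helper itself; for example, lane_layout [1, 16] with permutation [1, 0] becomes [16, 1]):

#include <cassert>
#include <cstdint>
#include <vector>

// Reorder the entries of a layout according to a permutation of its indices.
static std::vector<int64_t>
transposeLayout(const std::vector<int64_t> &layout,
                const std::vector<int64_t> &permutation) {
  std::vector<int64_t> transposed;
  for (int64_t idx : permutation)
    transposed.push_back(layout[idx]);
  return transposed;
}

int main() {
  // Transposing lane_layout [1, 16] with permutation [1, 0] gives [16, 1].
  assert((transposeLayout({1, 16}, {1, 0}) == std::vector<int64_t>{16, 1}));
  return 0;
}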
@@ -564,14 +565,15 @@ void LayoutInfoPropagation::visitStoreScatterOp(
xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
/// Currently, for 2D StoreScatterOp we expect that the height dimension of
- /// the tensor descriptor is evenly divisible by the subgroup size.
- /// TODO: Add support for other 2D shapes.
+ /// the tensor descriptor is equal to the subgroup size. This is ensured by
+ /// the op verifier.
auto tdescShape = storeScatter.getTensorDescType().getShape();
- if (tdescShape.size() > 1 && tdescShape[0] % subgroupSize != 0) {
- storeScatter.emitError("Height dimension of the tensor descriptor should "
- "be evenly divisible by the subgroup size.");
- return;
- }
+ if (tdescShape.size() > 1)
+ assert(
+ tdescShape[0] == subgroupSize &&
+ "Expected the first dimension of 2D tensor descriptor to be equal to "
+ "subgroup size.");
+
auto valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
LayoutInfo storeScatterLayout = valueLayout;
if (storeScatter.getTranspose()) {
@@ -732,7 +734,7 @@ static LogicalResult attachLayoutAttributes(
return WalkResult::interrupt();
}
- /// Clone the op, attach the sg_map to the result tensor descriptor, and
+ /// Clone the op, attach the layout to the result tensor descriptor, and
/// remove the original op.
OpBuilder builder(op);
auto *newOp = builder.clone(*op);
@@ -976,7 +978,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
/// Example:
///
/// ```
-/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32>) {
/// ...
@@ -1019,7 +1021,7 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
- descOp, "the tensor descriptor lacks sg_map attribute");
+ descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
SmallVector<Value> newYieldValues;
@@ -1055,12 +1057,12 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`.
/// In case arguments for the store are passed through the warp op interface
/// they would be propagated as returned values. Only the source vector for
-/// the store is distributed according to sg_map attribute.
+/// the store is distributed according to layout attribute.
///
/// Example:
///
/// ```
-/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
@@ -1091,7 +1093,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
- storeOp, "the source tensor descriptor lacks sg_map attribute");
+ storeOp, "the source tensor descriptor lacks layout attribute");
auto distributedTypeByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
@@ -1146,13 +1148,13 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later with dce). The yield op will
/// bypass the load's arguments. Only the loaded vector is distributed
-/// according to sg_map attribute and, tensor descriptor types is not
+/// according to layout attribute and, tensor descriptor types is not
/// distributed.
///
/// Example:
///
/// ```
-/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<4x1xf32>) {
/// ...
@@ -1186,7 +1188,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
- loadOp, "the source tensor descriptor lacks sg_map attribute");
+ loadOp, "the source tensor descriptor lacks layout attribute");
unsigned operandIdx = operand->getOperandNumber();
VectorType distributedTypeByWarpOp =
>From aa7dbe150e307c294093cb847f98df4f3268da45 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 16 Apr 2025 20:10:05 +0000
Subject: [PATCH 52/53] fix
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 370 +++++++++++-------
1 file changed, 227 insertions(+), 143 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 532509fcc0910..f979c7bb879b6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -680,7 +680,37 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
}
}
-static void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
+namespace {
+
+///===----------------------------------------------------------------------===///
+/// LayoutAttrAssignment
+///===----------------------------------------------------------------------===///
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+ LayoutAttrAssignment(Operation *top,
+ function_ref<LayoutInfo(Value)> getLayout)
+ : getAssignedLayout(getLayout), top(top) {}
+
+ LogicalResult run();
+
+private:
+ LogicalResult assign(Operation *op);
+ void assignToUsers(Value v, xegpu::LayoutAttr layout);
+ xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+ LogicalResult resolveConflicts();
+ function_ref<LayoutInfo(Value)>
+ getAssignedLayout; // Callable to get the layout of a value based on the
+ // layout propagation analysis.
+ Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
for (OpOperand &user : v.getUses()) {
Operation *owner = user.getOwner();
unsigned operandNumber = user.getOperandNumber();
@@ -691,93 +721,109 @@ static void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
}
}
-static LogicalResult attachLayoutAttributes(
- Operation *top, llvm::function_ref<LayoutInfo(Value)> getPropagatedLayout) {
- /// Helper to convert the layout info to the xegpu::LayoutAttr.
- auto getLayoutInfoForResult = [&](Value r) -> xegpu::LayoutAttr {
- auto layout = getPropagatedLayout(r);
- if (!layout.isAssigned())
- return {};
- SmallVector<int, 2> laneLayout, laneData;
- for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
- layout.getDataAsArrayRef())) {
- laneLayout.push_back(static_cast<int>(layout));
- laneData.push_back(static_cast<int>(data));
- }
- return xegpu::LayoutAttr::get(r.getContext(), laneLayout, laneData);
- };
- /// Attach the layout attributes to the results of the operations.
- auto walkResult = top->walk([&](Operation *op) {
- /// For function ops, propagate the argument layout to the users.
- if (auto func = dyn_cast<FunctionOpInterface>(op)) {
- for (auto arg : func.getArguments()) {
- auto layoutInfo = getLayoutInfoForResult(arg);
- if (layoutInfo) {
- attachLayoutAttributeToUsers(arg, layoutInfo);
- }
- }
- return WalkResult::advance();
- }
- /// If no results, move on.
- if (op->getNumResults() == 0)
- return WalkResult::advance();
- /// If all the results are scalars, move on.
- if (llvm::all_of(op->getResultTypes(),
- [](Type t) { return t.isIntOrIndexOrFloat(); }))
- return WalkResult::advance();
-
- if (auto tensorDescTy =
- dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
- auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
- if (!layoutInfo) {
- LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
- return WalkResult::interrupt();
- }
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+ auto layout = getAssignedLayout(v);
+ if (!layout.isAssigned())
+ return {};
+ SmallVector<int, 2> laneLayout, laneData;
+ for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+ layout.getDataAsArrayRef())) {
+ laneLayout.push_back(static_cast<int>(layout));
+ laneData.push_back(static_cast<int>(data));
+ }
+ return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
- /// Clone the op, attach the layout to the result tensor descriptor, and
- /// remove the original op.
- OpBuilder builder(op);
- auto *newOp = builder.clone(*op);
- auto newTensorDescTy = xegpu::TensorDescType::get(
- tensorDescTy.getContext(), tensorDescTy.getShape(),
- tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
- layoutInfo);
- newOp->getResult(0).setType(newTensorDescTy);
- op->replaceAllUsesWith(newOp->getResults());
- op->erase();
- return WalkResult::advance();
- }
- /// Otherwise simply attach the layout to the op itself.
- for (auto [i, r] : llvm::enumerate(op->getResults())) {
- auto layoutInfo = getLayoutInfoForResult(r);
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+ /// For function ops, propagate the function argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (auto arg : func.getArguments()) {
+ auto layoutInfo = getLayoutAttrForValue(arg);
if (layoutInfo) {
- auto attrName = resultLayoutNamePrefix + std::to_string(i);
- op->setAttr(attrName, layoutInfo);
- /// Attach the layout attribute to the users of the result.
- attachLayoutAttributeToUsers(r, layoutInfo);
+ assignToUsers(arg, layoutInfo);
}
}
+ return success();
+ }
+ /// If no results, move on.
+ if (op->getNumResults() == 0)
+ return success();
+ /// If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return success();
+ /// If the result is a tensor descriptor, attach the layout to the tensor
+ /// descriptor itself.
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+ auto layoutInfo = getLayoutAttrForValue(op->getResult(0));
+ if (!layoutInfo) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return failure();
+ }
+
+ /// Clone the op, attach the layout to the result tensor descriptor, and
+ /// remove the original op.
+ OpBuilder builder(op);
+ auto *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return success();
+ }
+ /// Otherwise simply attach the layout to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ auto layoutInfo = getLayoutAttrForValue(r);
+ if (layoutInfo) {
+ auto attrName = resultLayoutNamePrefix + std::to_string(i);
+ op->setAttr(attrName, layoutInfo);
+ /// Attach the layout attribute to the users of the result.
+ assignToUsers(r, layoutInfo);
+ }
+ }
+ return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+ auto walkResult = top->walk([&](Operation *op) {
+ if (failed(assign(op)))
+ return WalkResult::interrupt();
return WalkResult::advance();
});
- return failure(walkResult.wasInterrupted());
-}
+ if (walkResult.wasInterrupted())
+ return failure();
-static LogicalResult resolveLayoutConflicts(Operation *top) {
- /// TODO: Implement the layout conflict resolution.
- return success();
+ return resolveConflicts();
}
+/// TODO: Implement the layout conflict resolution. This must check mainly two
+/// things:
+// 1) Can a layout be supported by the op? (need to query the target
+/// HW info)
+// 2) Do all the operands have the required layout? If not, can it
+/// be reeolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
namespace {
///===----------------------------------------------------------------------===///
/// SIMT Distribution Patterns
///===----------------------------------------------------------------------===///
-/// Returns the distributed vector type for a source vector type according to
-/// the lane_layout. We simply divide each dimension of tensor descriptor shape
-/// by corresponding lane_layout dimension. If array_length > 1, that is
-/// appended to the front of the disributed shape.
+/// Helper function to get distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If array_length >
+/// 1, that is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
///
/// Examples:
/// | original vector shape | lane_layout | distributed vector shape |
@@ -810,6 +856,8 @@ FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
return VectorType::get(distributedShape, originalType.getElementType());
}
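To make the shape rule above concrete, here is a minimal standalone C++ sketch of the per-lane shape arithmetic. It is not the MLIR implementation: array_length handling and diagnostics are omitted, and the helper name is hypothetical.

#include <cassert>
#include <cstdint>
#include <vector>

// Compute the distributed (per-lane) vector shape: divide each dimension of
// the original shape by the corresponding lane_layout dimension. Returns an
// empty vector if the shapes are incompatible.
static std::vector<int64_t>
distributeShape(const std::vector<int64_t> &originalShape,
                const std::vector<int64_t> &laneLayout) {
  if (originalShape.size() != laneLayout.size())
    return {};
  std::vector<int64_t> distributed;
  for (size_t i = 0; i < originalShape.size(); ++i) {
    if (originalShape[i] % laneLayout[i] != 0)
      return {};
    distributed.push_back(originalShape[i] / laneLayout[i]);
  }
  return distributed;
}

int main() {
  // vector<8x16xf32> with lane_layout [1, 16] distributes to vector<8x1xf32>.
  assert((distributeShape({8, 16}, {1, 16}) == std::vector<int64_t>{8, 1}));
  // vector<16x16xf16> with lane_layout [1, 16] distributes to vector<16x1xf16>.
  assert((distributeShape({16, 16}, {1, 16}) == std::vector<int64_t>{16, 1}));
  return 0;
}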
+/// Get the distributed vector type for a source vector type according to a
+/// xegpu::LayoutAttr.
static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
VectorType originalType) {
auto shape = originalType.getShape();
@@ -824,13 +872,32 @@ static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
return distVecTyOrFailure.value();
}
+/// Drop the layout attribute from the tensor descriptor type if layout is
+/// present.
static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+ if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr())
+ return tensorDesc;
+
return xegpu::TensorDescType::get(
tensorDesc.getContext(), tensorDesc.getShape(),
tensorDesc.getElementType(), tensorDesc.getEncoding(),
xegpu::LayoutAttr());
}
+/// Helper function to resolve types if the distributed type out of
+/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
+/// Example 1:
+/// distributed type: vector<8x1xf32>
+/// expected type: vector<8xf32>
+/// resolved using,
+/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
+/// Example 2:
+/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
+/// expected type: xegpu.tensor_desc<8x16xf32>
+/// resolved using,
+/// %0 = xegpu.unrealized_conversion_cast %1 :
+/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
+/// xegpu.tensor_desc<8x16xf32>
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
PatternRewriter &rewriter) {
@@ -854,36 +921,9 @@ static Value resolveDistributedTy(Value orig, T expected,
return orig;
}
-// static Value reconcileDistributedTensorDescTy(Value orig,
-// xegpu::TensorDescType expected,
-// PatternRewriter &rewriter) {
-// assert(isa<xegpu::TensorDescType>(orig.getType()) &&
-// "expecting tensor descriptor type");
-// auto origTensorDescTy = cast<xegpu::TensorDescType>(orig.getType());
-// /// No need to reconcile if the types are the same.
-// if (origTensorDescTy == expected)
-// return orig;
-// auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
-// expected, orig);
-// return castOp.getResult(0);
-// }
-
-// // unify above 2 functions with a template
-// template <typename T>
-// static Value reconcileDistributedType(Value orig, T expected,
-// PatternRewriter &rewriter) {
-// if constexpr (std::is_same_v<T, VectorType>) {
-// return reconcileDistributedVecType(orig, expected, rewriter);
-// } else if constexpr (std::is_same_v<T, xegpu::TensorDescType>) {
-// return reconcileDistributedTensorDescTy(orig, expected, rewriter);
-// } else {
-// static_assert(llvm::is_one_of<T, VectorType,
-// xegpu::TensorDescType>::value,
-// "Unsupported type for reconciliation");
-// }
-// return orig;
-// }
-
+/// Helper function to filter out the temporary layout attributes attached
+/// during the layout assignment process. These are not needed after going to
+/// SIMT.
static SmallVector<NamedAttribute>
filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
SmallVector<NamedAttribute> newAttrs;
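A standalone sketch of the filtering idea described above. The real helper operates on MLIR NamedAttribute lists; matching temporary attributes by the result-layout name prefix is an assumption, and the prefix and attribute strings below are hypothetical.

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Drop attributes whose name marks them as temporary layout annotations.
using NamedAttr = std::pair<std::string, std::string>;

static std::vector<NamedAttr>
filterTemporaryLayoutAttrs(const std::vector<NamedAttr> &attrs,
                           const std::string &layoutPrefix) {
  std::vector<NamedAttr> kept;
  for (const NamedAttr &attr : attrs)
    if (attr.first.rfind(layoutPrefix, 0) != 0) // does not start with prefix
      kept.push_back(attr);
  return kept;
}

int main() {
  std::vector<NamedAttr> attrs = {{"layout_result_0", "#xegpu.layout<...>"},
                                  {"mode", "simt"}};
  auto kept = filterTemporaryLayoutAttrs(attrs, "layout_result_");
  // Only the non-layout attribute survives the filtering.
  assert(kept.size() == 1 && kept[0].first == "mode");
  return 0;
}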
@@ -968,31 +1008,32 @@ struct MoveFuncBodyToWarpExecuteOnLane0
}
};
-/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op
-/// will still contain the original op that will not be used by the yield op
-/// (and should be cleaned up later with dce). The yield op will bypass the
-/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because
-/// it is a uniform value accorss all work items within the subgroup.
+/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
+/// arguments. Tensor descriptor shape is not distributed because it is a
+/// uniform value across all work items within the subgroup. However, the
+/// layout information is dropped in the new tensor descriptor type.
///
/// Example:
///
/// ```
-/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
-/// (!xegpu.tensor_desc<4x8xf32>) {
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
/// ...
/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
-/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
/// vector.yield %td
/// }
/// ```
/// To
/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
/// ...
/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
-/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
/// vector.yield %arg0, %dead
/// }
/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
@@ -1054,27 +1095,34 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
}
};
-/// Sink a store_nd op at the end of enclosing `gpu.warp_execute_on_lane_0`.
-/// In case arguments for the store are passed through the warp op interface
-/// they would be propagated as returned values. Only the source vector for
-/// the store is distributed according to layout attribute.
+/// Distribute a store_nd op at the end of enclosing
+/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
+/// through the warp op interface they would be propagated as returned values.
+/// Source vector is distributed based on lane layout. Appropriate cast ops are
+/// inserted if the distributed types do not match the expected xegpu SIMT types.
///
/// Example:
///
/// ```
-/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
-/// !xegpu.tensor_desc<4x8xf32>
+/// !xegpu.tensor_desc<4x8xf32, #lo0>
/// }
/// ```
/// To
/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
-/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32>
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
+/// #lo0>
/// }
-/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
+/// %1 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0>
+/// -> !xegpu.tensor_desc<4x8xf32>
+/// xegpu.store_nd %0, %1: vector<4xf32>,
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
@@ -1143,34 +1191,39 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
}
};
-/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// Distribute a load_nd op feeding into vector.yield op for the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
-/// the yield op (and should be cleaned up later with dce). The yield op will
+/// the yield op (and should be cleaned up later). The yield op will
/// bypass the load's arguments. Only the loaded vector is distributed
-/// according to layout attribute and, tensor descriptor types is not
-/// distributed.
+/// according to lane layout and, tensor descriptor types is not
+/// distributed. Appropriate cast ops are inserted if the distributed types does
+/// not match expected xegpu SIMT types.
///
/// Example:
///
/// ```
-/// #layout_8 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<4x1xf32>) {
/// ...
-/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32> ->
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #lo0> ->
/// vector<4x8xf32>
/// gpu.yield %ld
/// }
/// ```
/// To
/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
/// ...
-/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
-/// vector<4x8xf32> gpu.yield %arg0, %arg1
+/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+/// vector<4x8xf32> gpu.yield %dead, %arg0
/// }
-/// %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
+/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0> -> !xegpu.tensor_desc<4x8xf32>
+/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
+/// %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32>
///
/// ```
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
@@ -1228,6 +1281,40 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
}
};
+/// Distribute a dpas op feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
+/// distributed types do not match the expected xegpu SIMT types.
+/// Example:
+/// ```
+/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
+/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (vector<8x1xf32>) {
+/// ...
+/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
+/// vector<8x16xf32>
+/// gpu.yield %dpas
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
+/// vector<8x1xf16>, vector<16x1xf16>) {
+/// ...
+/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
+/// -> vector<8x16xf32>
+/// gpu.yield %dead, %arg0, %arg1
+/// }
+/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
+/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
+/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
+/// vector<8xf32>
+/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
+/// ```
struct DpasDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
@@ -1347,18 +1434,15 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
auto getPropagatedLayout = [&](Value val) {
return analyis.getLayoutInfo(val);
};
- if (failed(attachLayoutAttributes(getOperation(), getPropagatedLayout)))
- signalPassFailure();
- if (failed(resolveLayoutConflicts(getOperation())))
+
+ /// Assign xegpu::LayoutAttr to all ops and their users based on the layout
+ /// propagation analysis result.
+ LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
+ if (failed(layoutAssignment.run()))
signalPassFailure();
- /// Move all operations inside a GPU functions inside
- /// gpu.warp_execute_on_lane0.
- /// We want to avoid ops from hoisted out of the gpu.warp_execute_on_lane0
- /// region.
- // GreedyRewriteConfig config;
- // config.cseConstants = false;
- // config.fold = false;
- // config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
+
+ /// Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
+ /// operation.
{
RewritePatternSet patterns(&getContext());
patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
>From ca5c7e91a3ca9119b4ca3499ce9875e495abb559 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 16 Apr 2025 20:58:44 +0000
Subject: [PATCH 53/53] fix comments
---
.../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index f979c7bb879b6..a49574b479e3d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -804,12 +804,12 @@ LogicalResult LayoutAttrAssignment::run() {
return resolveConflicts();
}
-/// TODO: Implement the layout conflict resolution. This must check mainly two
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
/// things:
-// 1) Can a layout be supported by the op? (need to query the target
-/// HW info)
-// 2) Do all the operands have the required layout? If not, can it
-/// be reeolved using a layout conversion?
+/// 1) Is a given layout supported by the op? (need to query the target
+/// HW info). Otherwise, can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+/// be resolved using a layout conversion?
LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
namespace {