[Mlir-commits] [mlir] [MLIR][XeGPU] Remove create tdesc op from xegpu dialect (PR #182804)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sun Feb 22 21:21:50 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: Nishant Patel (nbpatel)
<details>
<summary>Changes</summary>
---
The patch is 93.81 KiB and has been truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/182804.diff
11 Files Affected:
- (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td (+13-164)
- (modified) mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp (-53)
- (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp (+1-1)
- (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp (-23)
- (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp (+1-69)
- (modified) mlir/test/Dialect/XeGPU/invalid.mlir (-337)
- (modified) mlir/test/Dialect/XeGPU/ops.mlir (+1-287)
- (modified) mlir/test/Dialect/XeGPU/propagate-layout.mlir (+2-34)
- (modified) mlir/test/Dialect/XeGPU/xegpu-blocking.mlir (-149)
- (modified) mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir (-201)
- (modified) mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp (+1-3)
``````````diff
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 6d21aa9295716..7aff11d56a82a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -648,107 +648,6 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
let hasVerifier = 1;
}
-def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
- let summary = "create scattered tensor descriptors (TensorDesc).";
- let description = [{
- "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
- a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
- is for creating continuous subviews, "create_tdesc" is for creating non-continuous
- (scattered) subviews, allowing each lane in a subgroup specifying their own offset.
- It accepts the following parameters:
-
- Arguments:
-
- - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
- memory object.
-
- - `offsets`: a vector containing offsets of each access point. Its size
- is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
- implying each element in the vector corresponds to a SIMT lane in the subgroup.
-
- Results:
- - `res`: scattered tensor descriptor
-
- The first dimension of the result TensorDesc corresponds to lanes, so it should
- match the dimension of offsets. It may also has a second dimension corresponding to
- the chunk_size if the chunk size is larger than 1.
-
- Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
- ```mlir
- %a = memref.alloc() : memref<1024xf32>
- %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
- ```
-
- Example 2: It assumes subgroup size is 4, and each workitem access 8 elements.
- It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
-
- Example 3: It is similar to Example 2, but there is some overlaps among workitems.
- It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
- ```mlir
- %0 = memref.alloc() : memref<1024xf32>
- %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
- ```
- }];
-
- let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source,
- XeGPU_OffsetType:$offsets);
- let results = (outs XeGPU_TensorDesc:$TensorDesc);
-
- let builders = [
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<OpFoldResult>": $offsets)>,
- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
- "llvm::ArrayRef<int64_t>": $offsets)>,
- ];
-
- let assemblyFormat = [{
- $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
- }];
-
- let extraClassDeclaration = [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
-
- mlir::VectorType getOffsetsType() {
- return getOffsets().getType();
- }
-
- size_t getNumOffsets() {
- return getOffsetsType().getNumElements();
- }
-
- mlir::Value getViewSource() { return getSource(); }
-
- unsigned getSourceMemorySpace() {
- auto srcTy = getSource().getType();
- if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
- auto attr = memrefTy.getMemorySpace();
- if (attr) {
- if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
- return static_cast<unsigned>(intAttr.getInt());
- if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
- return static_cast<unsigned>(memSpaceAttr.getValue());
- }
- }
- // take global as default memory scope.
- return static_cast<unsigned>(MemorySpace::Global);
- }
-
- }];
-
- let hasVerifier = 1;
-}
-
def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
let summary = "prefetches a set of scattered data points to cache";
@@ -764,11 +663,9 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from source.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -794,7 +691,6 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> {
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
The source operand could be a raw pointer (ui64, ui32, i64, i32).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
@@ -896,11 +792,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
Arguments:
- `source`: represents the memory region to be loaded from, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from source. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from source.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -918,32 +812,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
Results:
- `res`: represents loaded data
-
- Example 1 (Workgroup level):
- ```mlir
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>},
- layout = #xegpu.layout<sg_layout = [8], sg_data = [32]>>
- : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<256xi1> -> vector<256xf32>
- ```
-
- Example 2 (Subgroup level):
- ```mlir
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>},
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>>
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
- vector<16xi1> -> vector<16x8xf32>
- ```
-
- Example 3 (Subgroup level):
- A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
- It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
- The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc
- for the restriction of memref.
+ Example 1 (Subgroup level):
+ A variant accepts memref as base pointer or the source operand
+ could be a raw pointer (ui64, ui32, i64, i32).
```mlir
%a = memref.alloc() : memref<1024xf32>
%offsets = vector.step : vector<16xindex>
@@ -955,7 +826,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
- Example 4 (lane level):
+ Example 2 (lane level):
lane level only accepts the offsets variant. chunk_size can be inferred from result
type. In this example, chunk_size is 8.
```mlir
@@ -1067,11 +938,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
- `value`: represents the data to be stored.
- `dest`: represents the memory region to be stored to, which can be either a
- tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
- In case of tensor_desc, offsets come from the producer create_tdesc op.
- tensor_desc cannot be used at lane level.
+ 1D memref or pointer (ui64, ui32, i64 or i32).
- - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from dest.
offsets is a vector of `index` type and vector length is either the subgroup size
or 1 at lane level. scalar offset is also valid for lane level.
@@ -1087,29 +956,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
to be stored. Only valid at workgroup and subgroup levels.
- Example 1 (Workgroup level):
- ```mlir
- xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>,
- layout = #xegpu.layout<sg_layout = [8], sg_data = [16]>}>
- : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1>
- ```
-
- Example 2 (Subgroup level):
- ```mlir
- xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>,
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 8]>}>
- : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
- ```
-
- Example 3 (Subgroup level):
- A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
- It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
+ Example 1 (Subgroup level):
+ A variant accepts memref as base pointer and an offset.
The dest operand could be a raw pointer (uint64_t).
- Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
%val = arith.constant dense<0.0> : vector<16xf32>
@@ -1122,7 +971,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
: vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex>
```
- Example 4 (Lane level):
+ Example 2 (Lane level):
Lane level IR only accepts the offsets variant. chunk_size can be inferred from value
type. In this example, chunk_size is 8.
```mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 3aba0f5070764..29e8419173aa2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -739,59 +739,6 @@ LogicalResult UpdateNdOffsetOp::verify() {
return success();
}
-//===----------------------------------------------------------------------===//
-// XeGPU_CreateDescOp
-//===----------------------------------------------------------------------===//
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<OpFoldResult> offsets) {
- auto loc = source.getLoc();
- int64_t size = static_cast<int64_t>(offsets.size());
- auto type = VectorType::get(size, builder.getIndexType());
- auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
- auto offset = vector::FromElementsOp::create(builder, loc, type, values);
- build(builder, state, TensorDesc, source, offset);
-}
-
-void CreateDescOp::build(OpBuilder &builder, OperationState &state,
- TensorDescType TensorDesc, Value source,
- llvm::ArrayRef<int64_t> offsets) {
- auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
- build(builder, state, TensorDesc, source, ofrs);
-}
-
-LogicalResult CreateDescOp::verify() {
- auto tdescTy = getTensorDescType();
-
- if (!tdescTy.isScattered())
- return emitOpError("Expects a scattered TensorDesc.\n");
-
- // Memory space of created TensorDesc should match with the source.
- // Both source and TensorDesc are considered for global memory by default,
- // if the memory scope attr is not specified. If source is an integer,
- // it is considered as ptr to global memory.
- auto srcMemorySpace = getSourceMemorySpace();
- auto tdescMemorySpace = static_cast<unsigned>(tdescTy.getMemorySpace());
- if (srcMemorySpace != tdescMemorySpace)
- return emitOpError("Memory space mismatch.")
- << " Source: " << srcMemorySpace
- << ", TensorDesc: " << tdescMemorySpace;
-
- // check total size
- auto chunkSize = tdescTy.getChunkSizeAsInt();
- SmallVector<int64_t> shape(getOffsetsType().getShape());
- if (chunkSize != 1)
- shape.push_back(chunkSize);
-
- auto tdescShape = getShapeOf(tdescTy);
- if (shape != tdescShape)
- return emitOpError("Incorrect TensorDesc shape. ")
- << "Expected is " << makeString(shape) << "\n";
-
- return success();
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_PrefetchOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 49b66d2a8f6f6..cb269330cff46 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -160,7 +160,7 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
- if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
+ if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
return getTileShape(op->getOpResult(0));
if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index bc309c9029878..dafeb04157439 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -348,10 +348,6 @@ class LayoutInfoPropagation
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
- void visitCreateDescOp(xegpu::CreateDescOp createDesc,
- ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results);
-
void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@@ -442,9 +438,6 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](xegpu::LoadGatherOp loadGatherOp) {
visitLoadGatherOp(loadGatherOp, operands, results);
})
- .Case([&](xegpu::CreateDescOp createDescOp) {
- visitCreateDescOp(createDescOp, operands, results);
- })
.Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) {
visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
})
@@ -1027,22 +1020,6 @@ void LayoutInfoPropagation::visitLoadGatherOp(
propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
}
-/// Propagate the layout of the descriptor to the vector offset operand in
-/// CreateDescOp.
-void LayoutInfoPropagation::visitCreateDescOp(
- xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
- ArrayRef<const LayoutInfoLattice *> results) {
- LayoutInfo descLayout = results[0]->getValue();
- // Need the layout of the descriptor to propagate to the operands.
- if (!descLayout.isAssigned())
- return;
- auto uArch = getUArch(getChipStr(createDesc).value_or(""));
- // For offset operand propagate 1D default layout.
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
- uArch->getSubgroupSize());
- propagateIfChanged(operands[1], operands[1]->meet(layout));
-}
-
/// Set the layout for the value, tensor descriptor, offset and mask operands in
/// the StoreScatterOp.
void LayoutInfoPropagation::visitStoreScatterOp(
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 2b1bd4d73a576..d49d797581a5e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -477,74 +477,6 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
}
};
-struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
- using UnrollPattern<xegpu::CreateDescOp>::UnrollPattern;
- LogicalResult matchAndRewrite(xegpu::CreateDescOp op,
- PatternRewriter &rewriter) const override {
- Location loc = op.getLoc();
- xegpu::TensorDescType tdescTy = op.getType();
- TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
- VectorType indiceVecTy = indiceVec.getType();
-
- if (!tdescTy.isScattered())
- return failure();
-
- std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
- if (!targetShape)
- return failure();
-
- SmallVector<int64_t> targetIndiceShape(*targetShape);
- int64_t originalChunkSize = tdescTy.getChunkSizeAsInt();
- // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1.
- if (originalChunkSize > 1)
- targetIndiceShape.pop_back();
-
- auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
- SmallVector<Type> convertedIndiceTypes =
- getUnrolledTypes(indiceVecTy, targetIndiceShape);
- SmallVector<Value> convertedIndiceVec =
- pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter);
-
- SmallVector<Value> newOps;
-
- // More indices is need when chunkSize > 1. Since a big load from one
- // address could be break into multiple small loads.
- if (originalChunkSize > 1) {
- int64_t blockedChunkSize = targetShape->back();
- int64_t numNewChunks = originalChunkSize / blockedChunkSize;
-
- for (auto [indice, indiceType] :
- llvm::zip(convertedIndiceVec, convertedIndiceTypes)) {
- for (int64_t i = 0; i < numNewChunks; ++i) {
- // Compute the offset
- Value inc = arith::ConstantIndexOp::create(rewriter, loc,
- i * blockedChunkSize);
- Value incVec =
- vector::BroadcastOp::create(rewriter, loc, indiceType, inc);
- Value offsetIndice =
- arith::AddIOp::create(rewriter, loc, indice, incVec);
-
- auto newOp = xegpu::CreateDescOp::create(
- rewriter, loc, newTdescTy, op.getSource(), offsetIndice);
-
- newOps.push_back(newOp);
- }
- }
- } else {
- for (auto indice : convertedIndiceVec) {
- auto newOp = xegpu::CreateDescOp::create(rewriter, loc, newTdescTy,
- op.getSource(), indice);
- ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/182804
More information about the Mlir-commits mailing list