[Mlir-commits] [mlir] [MLIR][XeGPU] Update XeGPU create_tdesc, update_offset, load, store and prefetch. (PR #154653)
Sang Ik Lee
llvmlistbot at llvm.org
Thu Aug 21 10:52:31 PDT 2025
https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/154653
>From 4e9d25f308ecd8a7a45353645121d2db85317fd3 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 20 Aug 2025 20:24:17 +0000
Subject: [PATCH 1/8] Move create_tdesc addr shape restriction to .td, match
pointer type for scatter ops with other xegpu pointer usage.
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 5 +++--
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 6 ++++--
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ----
mlir/test/Dialect/XeGPU/invalid.mlir | 6 +++---
4 files changed, 10 insertions(+), 11 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index eb54d6887681d..ac11210c6d0c2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -500,7 +500,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
(scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
It accepts the following parameters:
- * source: a 1D memref or pointer (uint64_t) represents the flattened memory object.
+ * source: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
+ memory object.
* offsets: a vector containing offsets of each access point. Its size
is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
implying each element in the vector corresponds to a work-item (SIMT lane)
@@ -536,7 +537,7 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
```
}];
- let arguments = (ins XeGPU_BaseAddrType: $source,
+ let arguments = (ins XeGPU_GatherScatterBaseAddrType: $source,
XeGPU_OffsetType: $offsets);
let results = (outs XeGPU_TensorDesc:$TensorDesc);
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f8b371db498e8..53ecedab5406d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -16,13 +16,15 @@ include "mlir/IR/BuiltinTypes.td"
def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
-def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64, UI32, I64, I32]>;
+def XeGPU_PointerType: AnyTypeOf<[UI64, UI32, I64, I32]>;
+def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>;
def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+def XeGPU_GatherScatterBaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
@@ -189,7 +191,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
let genVerifyDecl = 1;
}
-def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,Non0RankedMemRefOf<[XeGPU_ScalarType]>, UI64]>;
+def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,XeGPU_GatherScatterBaseAddrType]>;
def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier.";
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 906c71d8b8dad..4e6be230e1e87 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -685,10 +685,6 @@ void CreateDescOp::build(OpBuilder &builder, OperationState &state,
LogicalResult CreateDescOp::verify() {
auto tdescTy = getTensorDescType();
- if (getRankOf(getSource()) > 1)
- return emitOpError(
- "Expecting the source is a 1D memref or pointer (uint64_t).");
-
if (!tdescTy.isScattered())
return emitOpError("Expects a scattered TensorDesc.\n");
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 93a5a055b08c6..5d86dbf81e48f 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -387,7 +387,7 @@ func.func @load_gather_vc_3(%src: ui64) {
// -----
func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) {
%offsets = arith.constant dense<[0]> : vector<1xindex>
- // expected-error@+1 {{Expecting the source is a 1D memref or pointer}}
+ // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}}
xegpu.prefetch %src[%offsets]: memref<4x4xf32>, vector<1xindex>
return
}
@@ -428,7 +428,7 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) {
%val = arith.constant dense<2.9>: vector<4xf16>
%offsets = arith.constant dense<[0]> : vector<1xindex>
%mask = arith.constant dense<1>: vector<1xi1>
- // expected-error@+1 {{Expecting the dest is a 1D memref or pointer}}
+ // expected-error@+1 {{op operand #1 must be TensorDesc describing regions of interested data}}
xegpu.store %val, %src[%offsets], %mask
: vector<4xf16>, memref<4x4xf16>, vector<1xindex>, vector<1xi1>
return
@@ -447,7 +447,7 @@ func.func @load_gather_offset_wi_2(%src: ui64) {
func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) {
%mask = arith.constant dense<1>: vector<1xi1>
%offsets = arith.constant dense<[0]> : vector<1xindex>
- // expected-error@+1 {{Expecting the source is a 1D memref or pointer}}
+ // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}}
%2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : memref<4x4xf32>, vector<1xindex>, vector<1xi1> -> vector<2xf32>
return
}
>From 2e985ce836322927ae7c63a2db5f503636c8c645 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 20 Aug 2025 21:22:58 +0000
Subject: [PATCH 2/8] Update xegpu op descriptions.
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 83 ++++++++++++++-----
1 file changed, 60 insertions(+), 23 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index ac11210c6d0c2..0f2a13e1ae16c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -511,6 +511,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
match the dimension of offsets. It may also has a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.
+ This op is not available in SIMT mode.
+
Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
```mlir
%a = memref.alloc() : memref<1024xf32>
@@ -618,6 +620,15 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
: memref<1024xf32>, vector<4xindex>
```
+ Example 3 (SIMT mode):
+ SIMT mode only accepts the offsets variant.
+ ```mlir
+ xegpu.prefetch %0[%1] {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
+ l3_hint = #xegpu.cache_hint<cached>}
+ : memref<256xf32>, vector<1xindex>
+ ```
+
}];
let arguments = (ins XeGPU_GatherScatterSourceType: $source,
@@ -671,8 +682,18 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
- In SIMT mode, the result vector represents the data to be loaded by each work-item.
- Each work-item recieves a `chunk_size` number of elements.
+ In SIMT mode, the result is a 1D vector that represents the data to be loaded by
+ each work-item.
+
+ `source` represents the memory region to be loaded from, which can be either a
+ tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
+ In case of tensor_desc, offsets come from the producer create_tdesc op.
+ tensor_desc cannot be used in SIMT mode.
+ `offsets` represents offsets from source. required if `source` in not a TensorDescType.
+ offsets is a vector of `index` type and vector length is either the subgroup size
+ or 1 in SIMT mode.
+ `mask` is a vector of `i1` type, which is used to mask out the memory access.
+ mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
Example 1:
```mlir
@@ -692,16 +713,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
vector<16xi1> -> vector<16x8xf32>
```
- Example 3 (SIMT mode):
- ```mlir
- %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>
- vector<16xi1> -> vector<8xf32>
- ```
-
- Example 4:
+ Example 3:
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc
@@ -716,6 +728,16 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
+ Example 4 (SIMT mode):
+ SIMT mode only accepts the offsets variant. chunk_size can be inferred from result
+ type. In this example, chunk_size is 8.
+ ```mlir
+ %3 = xegpu.load %1[%2], %0 <{l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}>
+ : memref<128xf32>, vector<1xindex>, vector<1xi1> -> vector<8xf32>
+ ```
+
}];
let arguments = (ins XeGPU_GatherScatterSourceType: $source,
@@ -785,8 +807,19 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.
- In SIMT mode, the input vector represents the data to be stored by each work-item.
- Each work-item stores a `chunk_size` number of elements.
+ In SIMT mode, the result is a 1D vector that represents the data to be stored by
+ each work-item.
+
+ `value` represents the data to be stored.
+ `dest` represents the memory region to be stored to, which can be either a
+ tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
+ In case of tensor_desc, offsets come from the producer create_tdesc op.
+ tensor_desc cannot be used in SIMT mode.
+ `offsets` represents offsets from dest. required if `source` in not a TensorDescType.
+ offsets is a vector of `index` type and vector length is either the subgroup size
+ or 1 in SIMT mode.
+ `mask` is a vector of `i1` type, which is used to mask out the memory access.
+ mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
Example 1:
```mlir
@@ -804,15 +837,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
: vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
```
- Example 3 (SIMT mode):
- ```mlir
- xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}>
- : vector<8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>> vector<16xi1>
- ```
-
- Example 4:
+ Example 3:
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc".
The dest operand could be a raw pointer (uint64_t).
@@ -828,6 +853,16 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
: memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32>
```
+ Example 4 (SIMT mode):
+ SIMT mode only accepts the offsets variant. chunk_size can be inferred from value
+ type. In this example, chunk_size is 8.
+ ```mlir
+ xegpu.store %0, %1[%2], %3 <{l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}>
+ : vector<8xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
+ ```
+
}];
let arguments = (ins
@@ -896,6 +931,8 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
update the offset per work-item, so its offsets contains values representing
shifts for each work-item.
+ This op is not available in SIMT mode.
+
Example:
```mlir
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
>From 46857e36af6725c8eaff25e584f4e71bc7361949 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 20 Aug 2025 21:33:49 +0000
Subject: [PATCH 3/8] Update op validation.
---
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 37 +++++++++++++-------------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4e6be230e1e87..cf5da7a416846 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -58,13 +58,6 @@ static SmallVector<int64_t> getShapeOf(Type type) {
return shape;
}
-static int64_t getRankOf(Value val) {
- auto type = val.getType();
- if (auto ty = llvm::dyn_cast<ShapedType>(type))
- return ty.getRank();
- return 0;
-}
-
static bool isReadHintOrNone(const CachePolicyAttr &attr) {
if (!attr)
return true;
@@ -719,13 +712,15 @@ LogicalResult CreateDescOp::verify() {
LogicalResult PrefetchOp::verify() {
auto tdescTy = getTensorDescType();
+ if (!tdescTy && !getOffsets())
+ return emitOpError("Expects offsets.");
+
+ if (tdescTy && getOffsets())
+ return emitOpError("offsets not allowed.");
+
if (tdescTy && !tdescTy.isScattered())
return emitOpError("Expects a scattered TensorDesc.");
- if (!tdescTy && getRankOf(getSource()) > 1)
- return emitOpError(
- "Expecting the source is a 1D memref or pointer (uint64_t).");
-
if (!isReadHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -753,13 +748,15 @@ LogicalResult LoadGatherOp::verify() {
auto maskTy = getMaskType();
auto valueTy = getValueType();
+ if (!tdescTy && !getOffsets())
+ return emitOpError("Expects offsets.");
+
+ if (tdescTy && getOffsets())
+ return emitOpError("offsets not allowed.");
+
if (tdescTy && !tdescTy.isScattered())
return emitOpError("Expects a scattered TensorDesc.");
- if (!tdescTy && getRankOf(getSource()) > 1)
- return emitOpError(
- "Expecting the source is a 1D memref or pointer (uint64_t).");
-
if (!isReadHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
@@ -800,13 +797,15 @@ LogicalResult StoreScatterOp::verify() {
auto maskTy = getMaskType();
auto valueTy = getValueType();
+ if (!tdescTy && !getOffsets())
+ return emitOpError("Expects offsets.");
+
+ if (tdescTy && getOffsets())
+ return emitOpError("offsets not allowed.");
+
if (tdescTy && !tdescTy.isScattered())
return emitOpError("Expects a scattered TensorDesc.");
- if (!tdescTy && getRankOf(getDest()) > 1)
- return emitOpError(
- "Expecting the dest is a 1D memref or pointer (uint64_t).");
-
if (!isWriteHintOrNone(getL1HintAttr()))
return emitOpError("invalid l1_hint: ") << getL1HintAttr();
>From 1077b3abebc3fdc0cd97a6ddee3be845b449d71c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 20 Aug 2025 22:57:58 +0000
Subject: [PATCH 4/8] Update op descriptions
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0f2a13e1ae16c..8fd04a5d4cdcf 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -683,7 +683,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
In SIMT mode, the result is a 1D vector that represents the data to be loaded by
- each work-item.
+ each work-item. If size is not 1, size should be equal to the chunk size,
`source` represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
@@ -694,6 +694,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
or 1 in SIMT mode.
`mask` is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
+ `chunk_size` (optional) represents contiguous number of elements to load from per work item.
Example 1:
```mlir
@@ -808,7 +809,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
introduced on purpose, making sure users are aware of this implicit transformation.
In SIMT mode, the result is a 1D vector that represents the data to be stored by
- each work-item.
+ each work-item. If size is not 1, size should be equal to the chunk size.
`value` represents the data to be stored.
`dest` represents the memory region to be stored to, which can be either a
@@ -820,6 +821,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
or 1 in SIMT mode.
`mask` is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
+ `chunk_size` (optional) represents contiguous number of elements to store to per work item.
Example 1:
```mlir
>From af57f45e3536a695bd1ef19fdad213b211332e36 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 20 Aug 2025 23:30:31 +0000
Subject: [PATCH 5/8] Add invalid op checks for new op restriction.
---
mlir/test/Dialect/XeGPU/invalid.mlir | 55 ++++++++++++++++++++++++++++
1 file changed, 55 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index 5d86dbf81e48f..c076ac78b9edd 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -392,6 +392,23 @@ func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) {
return
}
+// -----
+func.func @prefetch_offset_wi_2(%src: memref<16xf32>) {
+ %offsets = arith.constant dense<[0]> : vector<1xindex>
+ %1 = xegpu.create_tdesc %src, %offsets : memref<16xf32>, vector<1xindex>
+ -> !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>
+ // expected-error@+1 {{offsets not allowed}}
+ xegpu.prefetch %1[%offsets]: !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>>, vector<1xindex>
+ return
+}
+
+// -----
+func.func @prefetch_offset_wi_3(%src: memref<16xf32>) {
+ // expected-error@+1 {{Expects offsets}}
+ xegpu.prefetch %src: memref<16xf32>
+ return
+}
+
// -----
func.func @load_gather_offset_sg(%src: memref<?xf16>) {
%offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
@@ -434,6 +451,44 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) {
return
}
+// -----
+func.func @store_scatter_offset_wi_3(%src: memref<16xf16>) {
+ %val = arith.constant dense<2.9>: vector<1xf16>
+ %mask = arith.constant dense<1>: vector<1xi1>
+ // expected-error@+1 {{Expects offsets}}
+ xegpu.store %val, %src, %mask
+ : vector<1xf16>, memref<16xf16>, vector<1xi1>
+ return
+}
+
+// -----
+func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>) {
+ %val = arith.constant dense<2.9>: vector<1xf16>
+ %offsets = arith.constant dense<[0]> : vector<1xindex>
+ %mask = arith.constant dense<1>: vector<1xi1>
+ // expected-error@+1 {{offsets not allowed}}
+ xegpu.store %val, %src[%offsets], %mask
+ : vector<1xf16>, !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1>
+ return
+}
+
+// -----
+func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>) {
+ %mask = arith.constant dense<1>: vector<1xi1>
+ %offsets = arith.constant dense<[0]> : vector<1xindex>
+ // expected-error@+1 {{offsets not allowed}}
+ %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> -> vector<2xf16>
+ return
+}
+
+// -----
+func.func @load_gather_offset_wi_3(%src: ui64) {
+ %mask = arith.constant dense<1>: vector<1xi1>
+ // expected-error@+1 {{Expects offsets}}
+ %2 = xegpu.load %src, %mask <{chunk_size = 2}> : ui64, vector<1xi1> -> vector<2xf16>
+ return
+}
+
// -----
func.func @load_gather_offset_wi_2(%src: ui64) {
%mask = arith.constant dense<1>: vector<1xi1>
>From daf18393e466df4d2a2d42d3ac6955d0a9cd812c Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 21 Aug 2025 16:39:22 +0000
Subject: [PATCH 6/8] Allow scalar offset for SIMT mode gather / scatter /
prefetch ops.
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 8fd04a5d4cdcf..1d4f89b108e52 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -609,7 +609,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
Example 2:
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc".
- The source operand could be a raw pointer (uint64_t).
+ The source operand could be a raw pointer (ui64, ui32, i64, i32).
Please refer to create_tdesc for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
@@ -632,7 +632,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
}];
let arguments = (ins XeGPU_GatherScatterSourceType: $source,
- Optional<XeGPU_OffsetType>: $offsets,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -742,7 +742,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
}];
let arguments = (ins XeGPU_GatherScatterSourceType: $source,
- Optional<XeGPU_OffsetType>: $offsets,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
XeGPU_MaskType: $mask,
OptionalAttr<I64Attr>: $chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
@@ -870,7 +870,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
let arguments = (ins
XeGPU_ValueType: $value,
XeGPU_GatherScatterSourceType: $dest,
- Optional<XeGPU_OffsetType>: $offsets,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
XeGPU_MaskType: $mask,
OptionalAttr<I64Attr>: $chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
>From 98f2caaf767efa37ae5b67945836f01037c065f7 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 21 Aug 2025 17:09:07 +0000
Subject: [PATCH 7/8] Update op description.
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 108 +++++++++++-------
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 11 +-
2 files changed, 73 insertions(+), 46 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 1d4f89b108e52..bf27bbc85a1f9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -70,28 +70,32 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
future). Elements in the subview continuous in each dimension. It encodes the
following important information for supporting Intel hardware features:
- * source: an object representing (starting address/pointer of) a memory region.
+ Arguments:
+ - `source`: an object representing (starting address/pointer of) a memory region.
It can be either a memref object, or simply a pointer represented by uint64_t type.
For the case of dynamic memrefs or pointer, the shape and layout information of the
memory region should be explicitly passed via `shape` and `strides` parameters.
- * offsets: index values represents offsets from the "source" at the each dimension
+ - `offsets`: index values represents offsets from the "source" at the each dimension
at which the subview of the target memory will be created. It is encoded via
"offsets" and "const_offsets", such that it can accept various forms, such as,
operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
- * shape: the shape information of the memory region pointed by the "source". It is
+ - `shape`: the shape information of the memory region pointed by the "source". It is
typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
But if "source" is simply a pointer represented as uint64_t type, or a memref
type without shape information e.g., memref<?x?xf16>, the shape information has
to be explicitly passed via the "shape" and "const_shape" arguments.
- * strides: the strides of the memory region pointed by the "source". Similar to shape,
+ - `strides`: the strides of the memory region pointed by the "source". Similar to shape,
it is typically encoded via the MemRefType of the source too. But if "source" is
simply a pointer represented as uint64_t type, or a memref type without shape
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.
+ Results:
+ - `res`: nd tensor descriptor
+
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
```mlir
%0 = memref.alloc() : memref<1024x1024xf32>
@@ -500,13 +504,17 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
(scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
It accepts the following parameters:
- * source: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
+ Arguments:
+ - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened
memory object.
- * offsets: a vector containing offsets of each access point. Its size
+ - `offsets`: a vector containing offsets of each access point. Its size
is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
implying each element in the vector corresponds to a work-item (SIMT lane)
in the subgroup.
+ Results:
+ - `res`: scattered tensor descriptor
+
The first dimension of the result TensorDesc corresponds to work-items, so it should
match the dimension of offsets. It may also has a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.
@@ -539,8 +547,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
```
}];
- let arguments = (ins XeGPU_GatherScatterBaseAddrType: $source,
- XeGPU_OffsetType: $offsets);
+ let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source,
+ XeGPU_OffsetType:$offsets);
let results = (outs XeGPU_TensorDesc:$TensorDesc);
let builders = [
@@ -598,6 +606,16 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
As compared to prefetch_nd, which works on non-scattered TensorDesc,
it works on scattered TensorDesc instead.
+ Arguments:
+ - `source`: represents the memory region to be loaded from, which can be either a
+ tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
+ In case of tensor_desc, offsets come from the producer create_tdesc op.
+ tensor_desc cannot be used in SIMT mode.
+ - `offsets`: represents offsets from source. Required if `source` is not a TensorDescType.
+ offsets is a vector of `index` type and vector length is either the subgroup size
+ or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
Example 1:
```mlir
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
@@ -631,11 +649,11 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
}];
- let arguments = (ins XeGPU_GatherScatterSourceType: $source,
- Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins XeGPU_GatherScatterSourceType:$source,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
let extraClassDeclaration = extraBaseClassDeclaration # [{
Type getSourceType() {
@@ -685,16 +703,22 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
In SIMT mode, the result is a 1D vector that represents the data to be loaded by
each work-item. If size is not 1, size should be equal to the chunk size,
- `source` represents the memory region to be loaded from, which can be either a
+ Arguments:
+ - `source`: represents the memory region to be loaded from, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
- `offsets` represents offsets from source. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from source. Required if `source` is not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
- or 1 in SIMT mode.
- `mask` is a vector of `i1` type, which is used to mask out the memory access.
+ or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ - `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
- `chunk_size` (optional) represents contiguous number of elements to load from per work item.
+ - `chunk_size`: (optional) represents the number of contiguous elements to load per work-item.
+ - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
+
+ Results:
+ - `res`: represents loaded data
+
Example 1:
```mlir
@@ -717,7 +741,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
Example 3:
A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc.
It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc".
- The source operand could be a raw pointer (uint64_t). Please refer to create_tdesc
+ The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc
for the restriction of memref.
```mlir
%a = memref.alloc() : memref<1024xf32>
@@ -741,13 +765,12 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
}];
- let arguments = (ins XeGPU_GatherScatterSourceType: $source,
- Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
- XeGPU_MaskType: $mask,
- OptionalAttr<I64Attr>: $chunk_size,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins XeGPU_GatherScatterSourceType:$source,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
+ XeGPU_MaskType:$mask, OptionalAttr<I64Attr>:$chunk_size,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
let results = (outs XeGPU_ValueType: $value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -801,7 +824,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+ let description =
+ [{ It (aka. store) stores data to scattered memory locations. The value is
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
@@ -811,17 +835,19 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
In SIMT mode, the result is a 1D vector that represents the data to be stored by
each work-item. If size is not 1, size should be equal to the chunk size.
- `value` represents the data to be stored.
- `dest` represents the memory region to be stored to, which can be either a
+ Arguments:
+ - `value`: represents the data to be stored.
+ - `dest`: represents the memory region to be stored to, which can be either a
tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32).
In case of tensor_desc, offsets come from the producer create_tdesc op.
tensor_desc cannot be used in SIMT mode.
- `offsets` represents offsets from dest. required if `source` in not a TensorDescType.
+ - `offsets`: represents offsets from dest. required if `dest` is not a TensorDescType.
offsets is a vector of `index` type and vector length is either the subgroup size
- or 1 in SIMT mode.
- `mask` is a vector of `i1` type, which is used to mask out the memory access.
+ or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
+ - `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
- `chunk_size` (optional) represents contiguous number of elements to store to per work item.
+ - `chunk_size`: (optional) represents contiguous number of elements to store to per work item.
+ - `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
Example 1:
```mlir
@@ -867,15 +893,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
}];
- let arguments = (ins
- XeGPU_ValueType: $value,
- XeGPU_GatherScatterSourceType: $dest,
- Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>: $offsets,
- XeGPU_MaskType: $mask,
- OptionalAttr<I64Attr>: $chunk_size,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins XeGPU_ValueType:$value,
+ XeGPU_GatherScatterSourceType:$dest,
+ Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
+ XeGPU_MaskType:$mask, OptionalAttr<I64Attr>:$chunk_size,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
let extraClassDeclaration = extraBaseClassDeclaration # [{
Type getDestType() {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 53ecedab5406d..84902b2039643 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -16,15 +16,17 @@ include "mlir/IR/BuiltinTypes.td"
def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>;
-def XeGPU_PointerType: AnyTypeOf<[UI64, UI32, I64, I32]>;
-def XeGPU_BaseAddrType: AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>;
+def XeGPU_PointerType : AnyTypeOf<[UI64, UI32, I64, I32]>;
+def XeGPU_BaseAddrType
+ : AnyTypeOf<[Non0RankedMemRefOf<[XeGPU_ScalarType]>, XeGPU_PointerType]>;
def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_GatherScatterBaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
+def XeGPU_GatherScatterBaseAddrType
+ : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
@@ -191,7 +193,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
let genVerifyDecl = 1;
}
-def XeGPU_GatherScatterSourceType : AnyTypeOf<[XeGPU_TensorDesc,XeGPU_GatherScatterBaseAddrType]>;
+def XeGPU_GatherScatterSourceType
+ : AnyTypeOf<[XeGPU_TensorDesc, XeGPU_GatherScatterBaseAddrType]>;
def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier.";
>From 68492e2091b200c121c874ef9ae7acde7b6cf4b1 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Thu, 21 Aug 2025 17:52:04 +0000
Subject: [PATCH 8/8] Allow create_tdesc and update_offset in SIMT mode. Allow
scalar mask for load/store/prefetch op in SIMT mode.
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index bf27bbc85a1f9..b8d706bd9e6cb 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -519,8 +519,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
match the dimension of offsets. It may also has a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.
- This op is not available in SIMT mode.
-
Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
```mlir
%a = memref.alloc() : memref<1024xf32>
@@ -713,6 +711,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
+ scalar mask is also valid for SIMT mode.
- `chunk_size`: (optional) represents contiguous number of elements to load from per work item.
- `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
@@ -767,7 +766,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
let arguments = (ins XeGPU_GatherScatterSourceType:$source,
Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
- XeGPU_MaskType:$mask, OptionalAttr<I64Attr>:$chunk_size,
+ AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
@@ -846,6 +845,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
or 1 in SIMT mode. scalar offset is also valid for SIMT mode.
- `mask`: is a vector of `i1` type, which is used to mask out the memory access.
mask is a vector of size equal to the subgroup size, or 1 in SIMT mode.
+ scalar mask is also valid for SIMT mode.
- `chunk_size`: (optional) represents contiguous number of elements to store to per work item.
- `l1_hint`, `l2_hint`, `l3_hint`: are optional cache hints for each level of cache.
@@ -896,7 +896,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
let arguments = (ins XeGPU_ValueType:$value,
XeGPU_GatherScatterSourceType:$dest,
Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
- XeGPU_MaskType:$mask, OptionalAttr<I64Attr>:$chunk_size,
+ AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
@@ -957,8 +957,6 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
update the offset per work-item, so its offsets contains values representing
shifts for each work-item.
- This op is not available in SIMT mode.
-
Example:
```mlir
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
More information about the Mlir-commits
mailing list