[Mlir-commits] [mlir] [MLIR][Conversion] XeGPU to XeVM: create_nd_tdesc - Add support for base memory rank > 2 (PR #164701)
Sang Ik Lee
llvmlistbot at llvm.org
Fri Oct 24 09:45:56 PDT 2025
https://github.com/silee2 updated https://github.com/llvm/llvm-project/pull/164701
>From 4a92953e944f31c80b047c0e90f050cbb4d983e9 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 21 Oct 2025 22:13:38 +0000
Subject: [PATCH 01/10] [MLIR][Conversion] XeGPU to XeVM: Create nd tensor
descriptor payload for base memory rank > 2
---
.../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 82 +++++++++++++++----
1 file changed, 66 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index fcbf66dbe9e45..bb56c096879a3 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -151,6 +151,21 @@ translateStoreXeGPUCacheHint(std::optional<xegpu::CachePolicy> L1hint,
}
}
+// Compute the product of sizes in the range [lo, hi) from the sizes array.
+static Value getProductOfSizes(ConversionPatternRewriter &rewriter,
+ Location loc, ArrayRef<OpFoldResult> sizes,
+ size_t lo, size_t hi) {
+ Type indexTy = rewriter.getIndexType();
+ Value product = arith::ConstantIndexOp::create(rewriter, loc, 1);
+ for (size_t idx = lo; idx < hi; idx++) {
+ OpFoldResult ofr = sizes[idx];
+ Value sizeVal = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
+ sizeVal = getValueOrCreateCastToIndexLike(rewriter, loc, indexTy, sizeVal);
+ product = rewriter.createOrFold<arith::MulIOp>(loc, product, sizeVal);
+ }
+ return product;
+}
+
class CreateNdDescToXeVMPattern
: public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern::OpConversionPattern;
@@ -184,10 +199,9 @@ class CreateNdDescToXeVMPattern
// Source can be a memref or a pointer (ui64, ui32, i64 or i32).
SmallVector<OpFoldResult> mixedSizes = op.getMixedSizes();
- // Descriptor shape is expected to be 2D.
- int64_t rank = mixedSizes.size();
- if (rank != 2)
- return rewriter.notifyMatchFailure(op, "Expected 2D shape.");
+ auto srcRank = mixedSizes.size();
+ if (srcRank < 2)
+ return rewriter.notifyMatchFailure(op, "Expected at least 2D source.");
auto sourceTy = source.getType();
auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
@@ -203,9 +217,8 @@ class CreateNdDescToXeVMPattern
baseAddr = adaptor.getSource();
}
// Utility for creating offset values from op fold result.
- auto createOffset = [&](SmallVector<OpFoldResult> &ofrVec,
- unsigned idx) -> Value {
- Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofrVec[idx]);
+ auto createOffset = [&](OpFoldResult ofr) -> Value {
+ Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
val = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy, val);
return val;
};
@@ -213,8 +226,14 @@ class CreateNdDescToXeVMPattern
offsetW = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
offsetH = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
// Get shape values from op fold results.
- baseShapeW = createOffset(mixedSizes, 1);
- baseShapeH = createOffset(mixedSizes, 0);
+ baseShapeW = createOffset(mixedSizes[srcRank - 1]);
+ if (srcRank == 2) {
+ baseShapeH = createOffset(mixedSizes[0]);
+ } else {
+ // Generate compute chain for height (product of sizes of all but the last
+ // dimension).
+ baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
+ }
if (sourceMemrefTy) {
// Cast index to i64.
baseAddr = arith::IndexCastUIOp::create(rewriter, loc, i64Ty, baseAddr);
@@ -255,10 +274,18 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
LogicalResult
matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ auto tdVal = op.getTensorDesc();
+ xegpu::CreateNdDescOp descOp =
+ tdVal.template getDefiningOp<xegpu::CreateNdDescOp>();
+ auto mixedStrides = descOp.getMixedStrides();
auto mixedOffsets = op.getMixedOffsets();
- int64_t opOffsetsSize = mixedOffsets.size();
- if (opOffsetsSize != 2)
- return rewriter.notifyMatchFailure(op, "Expected 2D offsets.");
+ auto mixedSizes = descOp.getMixedSizes();
+ size_t opOffsetsSize = mixedOffsets.size();
+ if (opOffsetsSize != mixedStrides.size())
+ return rewriter.notifyMatchFailure(
+ op, "Offsets size should match base memory rank.");
+ if (opOffsetsSize < 2)
+ return rewriter.notifyMatchFailure(op, "Expected at least 2D offset.");
auto loc = op.getLoc();
auto ctxt = rewriter.getContext();
@@ -283,12 +310,35 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
rewriter, loc, tdesc, static_cast<int>(NdTdescOffset::BaseShapeH));
// Offsets are provided by the op.
// convert them to i32.
- Value offsetW =
- getValueOrCreateConstantIntOp(rewriter, loc, mixedOffsets[1]);
+ // Offset computation assumes base memory layout is row major.
+ Value offsetW = getValueOrCreateConstantIntOp(
+ rewriter, loc, mixedOffsets[opOffsetsSize - 1]);
offsetW = getValueOrCreateCastToIndexLike(rewriter, loc,
rewriter.getI32Type(), offsetW);
- Value offsetH =
- getValueOrCreateConstantIntOp(rewriter, loc, mixedOffsets[0]);
+ Value offsetH;
+ if (opOffsetsSize == 2)
+ offsetH = getValueOrCreateConstantIntOp(rewriter, loc, mixedOffsets[0]);
+ else {
+ offsetH = arith::ConstantIndexOp::create(rewriter, loc, 0);
+ Value tmpStride = arith::ConstantIndexOp::create(rewriter, loc, 1);
+ // offsetH requires computing the linear offset using the strides.
+ for (size_t idx = 0; idx < opOffsetsSize - 1; idx++) {
+ size_t revIdx = opOffsetsSize - 2 - idx;
+ Value offsetVal =
+ getValueOrCreateConstantIntOp(rewriter, loc, mixedOffsets[revIdx]);
+ offsetVal = getValueOrCreateCastToIndexLike(
+ rewriter, loc, rewriter.getIndexType(), offsetVal);
+ Value mul =
+ rewriter.createOrFold<arith::MulIOp>(loc, tmpStride, offsetVal);
+ Value dimSize =
+ getValueOrCreateConstantIntOp(rewriter, loc, mixedSizes[revIdx]);
+ dimSize = getValueOrCreateCastToIndexLike(
+ rewriter, loc, rewriter.getIndexType(), dimSize);
+ tmpStride =
+ rewriter.createOrFold<arith::MulIOp>(loc, tmpStride, dimSize);
+ offsetH = rewriter.createOrFold<arith::AddIOp>(loc, offsetH, mul);
+ }
+ }
offsetH = getValueOrCreateCastToIndexLike(rewriter, loc,
rewriter.getI32Type(), offsetH);
// Get address space from tensor descriptor memory space.
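A note on the arithmetic introduced in this patch: the pattern collapses a rank-N base into the 2D view expected by the block ops (width = innermost dimension, height = product of the leading dimensions) and linearizes the leading offsets into that height, assuming the row-major layout called out in the comments. Below is a minimal standalone C++ sketch of that scheme, for illustration only; the helper names are hypothetical and the pass builds the equivalent arith ops instead:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Collapse an N-D row-major shape into the 2D (width, height) view used by
// the payload: width is the innermost dimension, height the product of the
// leading dimensions.
static std::pair<int64_t, int64_t>
collapseTo2D(const std::vector<int64_t> &shape) {
  int64_t width = shape.back();
  int64_t height = 1;
  for (std::size_t i = 0; i + 1 < shape.size(); ++i)
    height *= shape[i];
  return {width, height};
}

// Linearize the leading offsets into the collapsed height dimension,
// mirroring the stride-accumulation loop in the load/store/prefetch pattern.
static int64_t linearizeOffsetH(const std::vector<int64_t> &shape,
                                const std::vector<int64_t> &offsets) {
  int64_t offsetH = 0, stride = 1;
  for (std::size_t i = offsets.size() - 1; i-- > 0;) { // all but innermost dim
    offsetH += stride * offsets[i];
    stride *= shape[i];
  }
  return offsetH;
}

int main() {
  // Values taken from the memref<3x3x8x16xf32> test added later in this
  // series.
  std::vector<int64_t> shape = {3, 3, 8, 16};
  auto [w, h] = collapseTo2D(shape);
  assert(w == 16 && h == 72);
  // load_nd at [2, 2, 0, 0]:  offsetH = 2*24 + 2*8 + 0 = 64
  std::vector<int64_t> loadOffsets = {2, 2, 0, 0};
  assert(linearizeOffsetH(shape, loadOffsets) == 64);
  // store_nd at [1, 1, 0, 0]: offsetH = 1*24 + 1*8 + 0 = 32
  std::vector<int64_t> storeOffsets = {1, 1, 0, 0};
  assert(linearizeOffsetH(shape, storeOffsets) == 32);
  return 0;
}

The 72, 64, and 32 values line up with the %c72_i32, %c64_i32, and %c32_i32 constants that appear in the updated CHECK lines later in this series.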
>From e510643ae646688ce0c7175a7459ce1000bd4523 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Tue, 21 Oct 2025 23:00:44 +0000
Subject: [PATCH 02/10] Fix bugs and add test case for high rank base memref.
---
.../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 8 +++---
.../loadstore_nd_high_base_rank.mlir | 25 +++++++++++++++++++
2 files changed, 30 insertions(+), 3 deletions(-)
create mode 100644 mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index bb56c096879a3..a048fc0d091e6 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -152,15 +152,15 @@ translateStoreXeGPUCacheHint(std::optional<xegpu::CachePolicy> L1hint,
}
// Compute the product of sizes in the range [lo, hi) from the sizes array.
+// Note: all sizes are i64.
static Value getProductOfSizes(ConversionPatternRewriter &rewriter,
Location loc, ArrayRef<OpFoldResult> sizes,
size_t lo, size_t hi) {
- Type indexTy = rewriter.getIndexType();
- Value product = arith::ConstantIndexOp::create(rewriter, loc, 1);
+ Value product =
+ arith::ConstantIntOp::create(rewriter, loc, rewriter.getI64Type(), 1);
for (size_t idx = lo; idx < hi; idx++) {
OpFoldResult ofr = sizes[idx];
Value sizeVal = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
- sizeVal = getValueOrCreateCastToIndexLike(rewriter, loc, indexTy, sizeVal);
product = rewriter.createOrFold<arith::MulIOp>(loc, product, sizeVal);
}
return product;
@@ -233,6 +233,8 @@ class CreateNdDescToXeVMPattern
// Generate compute chain for height (product of sizes of all but the last
// dimension).
baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
+ baseShapeH = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy,
+ baseShapeH);
}
if (sourceMemrefTy) {
// Cast index to i64.
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
new file mode 100644
index 0000000000000..60d1dcf18d634
--- /dev/null
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
@@ -0,0 +1,25 @@
+// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
+
+gpu.module @load_store_check {
+ // CHECK: fail
+ gpu.func @load_store(%src: memref<3x3x8x16xf32, 1>, %dst: memref<3x3x8x16xf32, 1>) kernel {
+ %srcce = memref.memory_space_cast %src : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
+ %dstte = memref.memory_space_cast %dst : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
+
+ %src_tdesc = xegpu.create_nd_tdesc %srcce : memref<3x3x8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ %loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+
+ %tid_x = gpu.thread_id x
+ %tid_x_i32 = arith.index_cast %tid_x : index to i32
+ %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
+ %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
+
+ %dst_tdesc = xegpu.create_nd_tdesc %dstte : memref<3x3x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+
+ xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+ gpu.return
+ }
+}
>From 4e4cbd06de5cebb304cbc2ef64421c52a20b24ac Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 00:27:09 +0000
Subject: [PATCH 03/10] Replace 2D block load payload with i64.
---
.../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 131 ++++--------------
.../Conversion/XeGPUToXeVM/loadstore_nd.mlir | 2 +-
.../loadstore_nd_high_base_rank.mlir | 2 +-
.../XeGPUToXeVM/loadstore_nd_int_addr.mlir | 75 ++++++++++
.../Conversion/XeGPUToXeVM/prefetch_nd.mlir | 6 +-
5 files changed, 106 insertions(+), 110 deletions(-)
create mode 100644 mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index a048fc0d091e6..d7db65d662faf 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -48,15 +48,6 @@ namespace {
static constexpr int32_t systolicDepth{8};
static constexpr int32_t executionSize{16};
-// Offsets to individual fields of the 8xi32 layout nd tensor descriptor.
-enum class NdTdescOffset : uint32_t {
- BasePtr = 0, // Base pointer (i64)
- BaseShapeW = 2, // Base shape width (i32)
- BaseShapeH = 3, // Base shape height (i32)
- TensorOffsetW = 4, // Tensor offset W (i32)
- TensorOffsetH = 5 // Tensor offset H (i32)
-};
-
static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) {
switch (xeGpuMemspace) {
case xegpu::MemorySpace::Global:
@@ -177,92 +168,14 @@ class CreateNdDescToXeVMPattern
if (mixedOffsets.size() != 0)
return rewriter.notifyMatchFailure(op, "Offsets not supported.");
auto loc = op.getLoc();
- auto source = op.getSource();
- // Op is lowered to a code sequence that populates payload.
- // Payload is a 8xi32 vector. Offset to individual fields are defined in
- // NdTdescOffset enum.
- Type payloadElemTy = rewriter.getI32Type();
- VectorType payloadTy = VectorType::get(8, payloadElemTy);
- Type i64Ty = rewriter.getI64Type();
- // 4xi64 view is used for inserting the base pointer.
- VectorType payloadI64Ty = VectorType::get(4, i64Ty);
- // Initialize payload to zero.
- Value payload = arith::ConstantOp::create(
- rewriter, loc,
- DenseElementsAttr::get(payloadTy, IntegerAttr::get(payloadElemTy, 0)));
-
- Value baseAddr;
- Value baseShapeW;
- Value baseShapeH;
- Value offsetW;
- Value offsetH;
- // Source can be a memref or a pointer (ui64, ui32, i64 or i32).
- SmallVector<OpFoldResult> mixedSizes = op.getMixedSizes();
- auto srcRank = mixedSizes.size();
- if (srcRank < 2)
- return rewriter.notifyMatchFailure(op, "Expected at least 2D source.");
-
- auto sourceTy = source.getType();
- auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
- // If source is a memref, we need to extract the aligned pointer as index.
- // Pointer type is passed as i32 or i64 by type converter.
- if (sourceMemrefTy) {
- if (!sourceMemrefTy.hasStaticShape()) {
- return rewriter.notifyMatchFailure(op, "Expected static memref shape.");
- }
- baseAddr =
- memref::ExtractAlignedPointerAsIndexOp::create(rewriter, loc, source);
- } else {
- baseAddr = adaptor.getSource();
- }
- // Utility for creating offset values from op fold result.
- auto createOffset = [&](OpFoldResult ofr) -> Value {
- Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
- val = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy, val);
- return val;
- };
- // Offsets are not supported (0 is used).
- offsetW = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
- offsetH = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
- // Get shape values from op fold results.
- baseShapeW = createOffset(mixedSizes[srcRank - 1]);
- if (srcRank == 2) {
- baseShapeH = createOffset(mixedSizes[0]);
- } else {
- // Generate compute chain for height (product of sizes of all but the last
- // dimension).
- baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
- baseShapeH = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy,
- baseShapeH);
- }
- if (sourceMemrefTy) {
- // Cast index to i64.
- baseAddr = arith::IndexCastUIOp::create(rewriter, loc, i64Ty, baseAddr);
- } else if (baseAddr.getType() != i64Ty) {
+ Value baseAddr = adaptor.getSource();
+ Type i64Ty = rewriter.getI64Type();
+ if (baseAddr.getType() != i64Ty) {
// Pointer type may be i32. Cast to i64 if needed.
baseAddr = arith::ExtUIOp::create(rewriter, loc, i64Ty, baseAddr);
}
- // Populate payload.
- Value payLoadAsI64 =
- vector::BitCastOp::create(rewriter, loc, payloadI64Ty, payload);
- payLoadAsI64 =
- vector::InsertOp::create(rewriter, loc, baseAddr, payLoadAsI64,
- static_cast<int>(NdTdescOffset::BasePtr));
- payload = vector::BitCastOp::create(rewriter, loc, payloadTy, payLoadAsI64);
- payload =
- vector::InsertOp::create(rewriter, loc, baseShapeW, payload,
- static_cast<int>(NdTdescOffset::BaseShapeW));
- payload =
- vector::InsertOp::create(rewriter, loc, baseShapeH, payload,
- static_cast<int>(NdTdescOffset::BaseShapeH));
- payload = vector::InsertOp::create(
- rewriter, loc, offsetW, payload,
- static_cast<int>(NdTdescOffset::TensorOffsetW));
- payload = vector::InsertOp::create(
- rewriter, loc, offsetH, payload,
- static_cast<int>(NdTdescOffset::TensorOffsetH));
- rewriter.replaceOp(op, payload);
+ rewriter.replaceOp(op, baseAddr);
return success();
}
};
@@ -291,7 +204,6 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
auto loc = op.getLoc();
auto ctxt = rewriter.getContext();
- auto tdesc = adaptor.getTensorDesc();
auto tdescTy = op.getTensorDescType();
if (tdescTy.getRank() != 2)
return rewriter.notifyMatchFailure(op, "Expected 2D tensor descriptor.");
@@ -301,15 +213,27 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
return rewriter.notifyMatchFailure(
op, "Expected element type bit width to be multiple of 8.");
- VectorType payloadI64Ty = VectorType::get(4, rewriter.getI64Type());
- Value payLoadAsI64 =
- vector::BitCastOp::create(rewriter, loc, payloadI64Ty, tdesc);
- Value basePtr = vector::ExtractOp::create(
- rewriter, loc, payLoadAsI64, static_cast<int>(NdTdescOffset::BasePtr));
- Value baseShapeW = vector::ExtractOp::create(
- rewriter, loc, tdesc, static_cast<int>(NdTdescOffset::BaseShapeW));
- Value baseShapeH = vector::ExtractOp::create(
- rewriter, loc, tdesc, static_cast<int>(NdTdescOffset::BaseShapeH));
+ Value basePtr = adaptor.getTensorDesc();
+ // Utility for creating offset values from op fold result.
+ Type payloadElemTy = rewriter.getIntegerType(32);
+ auto createOffset = [&](OpFoldResult ofr) -> Value {
+ Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
+ val = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy, val);
+ return val;
+ };
+ auto srcRank = mixedSizes.size();
+ // Get shape values from op fold results.
+ Value baseShapeW = createOffset(mixedSizes[srcRank - 1]);
+ Value baseShapeH;
+ if (srcRank == 2) {
+ baseShapeH = createOffset(mixedSizes[0]);
+ } else {
+ // Generate compute chain for height (product of sizes of all but the last
+ // dimension).
+ baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
+ baseShapeH = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy,
+ baseShapeH);
+ }
// Offsets are provided by the op.
// convert them to i32.
// Offset computation assumes base memory layout is row major.
@@ -979,10 +903,7 @@ struct ConvertXeGPUToXeVMPass
return VectorType::get(sum, elemType);
});
typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type {
- if (type.isScattered())
- return IntegerType::get(&getContext(), 64);
- auto i32Type = IntegerType::get(&getContext(), 32);
- return VectorType::get(8, i32Type);
+ return IntegerType::get(&getContext(), 64);
});
// Convert MemDescType into flattened MemRefType for SLM
typeConverter.addConversion([&](xegpu::MemDescType type) -> Type {
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
index 4c6bbf25b4728..e4b206842e069 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
gpu.module @load_store_check {
gpu.func @load_store(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
index 60d1dcf18d634..e328517634b03 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
gpu.module @load_store_check {
// CHECK: fail
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
new file mode 100644
index 0000000000000..ca458eaf231c0
--- /dev/null
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
@@ -0,0 +1,75 @@
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
+
+gpu.module @load_store_check {
+ gpu.func @load_store(%src: ui64, %dst: ui32) kernel {
+ // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
+ // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
+ // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
+ // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
+ // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
+ // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
+ // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
+ // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c1 = arith.constant 1 : index
+ %src_tdesc = xegpu.create_nd_tdesc %src, shape:[%c8, %c16], strides:[%c16, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+
+
+ //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
+ //CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
+ //CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
+ //CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
+ //CHECK: %[[LD_TILE_W64:.*]] = arith.constant 0 : i64
+ //CHECK: %[[LD_TILE_W:.*]] = arith.trunci %[[LD_TILE_W64]] : i64 to i32
+ //CHECK: %[[LD_TILE_H64:.*]] = arith.constant 0 : i64
+ //CHECK: %[[LD_TILE_H:.*]] = arith.trunci %[[LD_TILE_H64]] : i64 to i32
+ //CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1>
+ //CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32
+ //CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32
+ //CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]],
+ //CHECK-SAME: %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]]
+ //CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
+ //CHECK-SAME: pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false,
+ //CHECK-SAME: v_blocks = 1 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+ %loaded = xegpu.load_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+ //CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32>
+
+ %tid_x = gpu.thread_id x
+ %tid_x_i32 = arith.index_cast %tid_x : index to i32
+ %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
+ //CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32>
+ %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
+
+ // CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
+ // CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
+ // CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
+ // CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32>
+ // CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32>
+ // CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32>
+ // CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32>
+ // CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32>
+ %dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%c8, %c16], strides:[%c16, %c1] : ui32 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+
+ //CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64>
+ //CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64>
+ //CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32>
+ //CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32>
+ //CHECK: %[[TILE_W64:.*]] = arith.constant 0 : i64
+ //CHECK: %[[TILE_W:.*]] = arith.trunci %[[TILE_W64]] : i64 to i32
+ //CHECK: %[[TILE_H64:.*]] = arith.constant 0 : i64
+ //CHECK: %[[TILE_H:.*]] = arith.trunci %[[TILE_H64]] : i64 to i32
+ //CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1>
+ //CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32
+ //CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32
+ //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32>
+ //CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]],
+ //CHECK-SAME: %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]]
+ //CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
+ //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+ xegpu.store_nd %loaded_modified, %dst_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+ gpu.return
+ }
+}
diff --git a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
index 873478aed57e3..ae87951a33447 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
-gpu.module @fence_check {
- gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
+gpu.module @prefetch_nd_check {
+ gpu.func @prefetch_nd(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
%srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
%dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>
>From 2546a37b2a955eb0a3e6133f458bddc7c6e4b2d0 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 10:41:20 -0700
Subject: [PATCH 04/10] Update test checks.
---
.../Conversion/XeGPUToXeVM/loadstore_nd.mlir | 74 ++++++-------------
1 file changed, 23 insertions(+), 51 deletions(-)
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
index e4b206842e069..0764129cfd447 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir
@@ -1,73 +1,45 @@
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
gpu.module @load_store_check {
+ // CHECK-LABEL: gpu.func @load_store
+ // CHECK-SAME: %[[ARG0:.*]]: memref<8x16xf32, 1>, %[[ARG1:.*]]: memref<8x16xf32, 1>
gpu.func @load_store(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
+ // CHECK: %[[C64_i32:.*]] = arith.constant 64 : i32
+ // CHECK: %[[C0_i32:.*]] = arith.constant 0 : i32
+ // CHECK: %[[C8_i32:.*]] = arith.constant 8 : i32
+ // CHECK: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]]
+ // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST:.*]] : memref<8x16xf32> -> index
+ // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
%srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
+ // CHECK: %[[MEMSPACECAST_0:.*]] = memref.memory_space_cast %[[ARG1]]
+ // CHECK: %[[INTPTR_1:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST_0]] : memref<8x16xf32> -> index
+ // CHECK: %[[VAR1:.*]] = arith.index_castui %[[INTPTR_1:.*]] : index to i64
%dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>
- // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
- // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
- // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
- // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
%src_tdesc = xegpu.create_nd_tdesc %srcce : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-
- //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
- //CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
- //CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
- //CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
- //CHECK: %[[LD_TILE_W64:.*]] = arith.constant 0 : i64
- //CHECK: %[[LD_TILE_W:.*]] = arith.trunci %[[LD_TILE_W64]] : i64 to i32
- //CHECK: %[[LD_TILE_H64:.*]] = arith.constant 0 : i64
- //CHECK: %[[LD_TILE_H:.*]] = arith.trunci %[[LD_TILE_H64]] : i64 to i32
- //CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1>
- //CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32
- //CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32
- //CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]],
- //CHECK-SAME: %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]]
- //CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
- //CHECK-SAME: pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false,
- //CHECK-SAME: v_blocks = 1 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+ // CHECK: %[[VAR2:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[VAR3:.*]] = xevm.blockload2d %[[VAR2]], %[[C64_i32]], %[[C8_i32]], %[[C64_i32]],
+ // CHECK-SAME: %[[C0_i32]], %[[C0_i32]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32,
+ // CHECK-SAME: tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
+ // CHECK: %[[VAR4:.*]] = vector.bitcast %[[VAR3]] : vector<8xi32> to vector<8xf32>
%loaded = xegpu.load_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
- //CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32>
%tid_x = gpu.thread_id x
%tid_x_i32 = arith.index_cast %tid_x : index to i32
%tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
- //CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32>
+ // CHECK: %[[VAR7:.*]] = vector.insert
%loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
- // CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
- // CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
- // CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
- // CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32>
- // CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32>
- // CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32>
- // CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32>
%dst_tdesc = xegpu.create_nd_tdesc %dstte : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
- //CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64>
- //CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64>
- //CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32>
- //CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32>
- //CHECK: %[[TILE_W64:.*]] = arith.constant 0 : i64
- //CHECK: %[[TILE_W:.*]] = arith.trunci %[[TILE_W64]] : i64 to i32
- //CHECK: %[[TILE_H64:.*]] = arith.constant 0 : i64
- //CHECK: %[[TILE_H:.*]] = arith.trunci %[[TILE_H64]] : i64 to i32
- //CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1>
- //CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32
- //CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32
- //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32>
- //CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]],
- //CHECK-SAME: %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]]
- //CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
- //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+ // CHECK: %[[VAR8:.*]] = llvm.inttoptr %[[VAR1]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[VAR9:.*]] = vector.bitcast %[[VAR7]] : vector<8xf32> to vector<8xi32>
+ // CHECK: xevm.blockstore2d %[[VAR8]], %[[C64_i32]], %[[C8_i32]], %[[C64_i32]], %[[C0_i32]], %[[C0_i32]], %[[VAR9]]
+ // CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
+ // CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}>
xegpu.store_nd %loaded_modified, %dst_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
: vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
gpu.return
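As a quick cross-check on the canonicalized constants in the CHECK lines above: the 2D block ops take the base row width in bytes (what the previous checks called %[[LD_BASE_ROW_IN_BYTES]], i.e. base width times sizeof(f32)) and the base height in rows, so memref<8x16xf32> yields the %[[C64_i32]]/%[[C8_i32]] pair. A tiny hedged C++ sketch of that computation (hypothetical helper; the pass emits the equivalent arith.muli):

#include <cassert>
#include <cstdint>

// Row width in bytes = width in elements * element size in bytes.
static int32_t rowWidthInBytes(int64_t widthInElems, int64_t elemBitWidth) {
  return static_cast<int32_t>(widthInElems * (elemBitWidth / 8));
}

int main() {
  // 16 f32 elements per row -> 64 bytes; the surface height stays 8 rows.
  assert(rowWidthInBytes(16, /*elemBitWidth=*/32) == 64);
  return 0;
}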
>From 05c889acaa07a47e3aa13de207a01a0a5c85788d Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 10:52:43 -0700
Subject: [PATCH 05/10] Update test checks.
---
.../Conversion/XeGPUToXeVM/prefetch_nd.mlir | 50 ++++++++-----------
1 file changed, 22 insertions(+), 28 deletions(-)
diff --git a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
index ae87951a33447..09f2108cc5aed 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir
@@ -1,40 +1,34 @@
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
gpu.module @prefetch_nd_check {
+ // CHECK-LABEL: gpu.func @prefetch_nd(
+ // CHECK-SAME: %[[ARG0:.*]]: memref<8x16xf32, 1>, %[[ARG1:.*]]: memref<8x16xf32, 1>) kernel {
gpu.func @prefetch_nd(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
+ // CHECK: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<8x16xf32, 1> to memref<8x16xf32>
+ // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST]] : memref<8x16xf32> -> index
+ // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
%srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
+ // CHECK: %[[MEMSPACECAST_0:.*]] = memref.memory_space_cast %[[ARG1]] : memref<8x16xf32, 1> to memref<8x16xf32>
%dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>
- // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
- // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
- // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
- // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
%src_tdesc = xegpu.create_nd_tdesc %srcce : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32,
- #xegpu.block_tdesc_attr<memory_space = global>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-
- //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
- //CHECK: %[[PREF_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
- //CHECK: %[[PREF_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
- //CHECK: %[[PREF_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
- //CHECK: %[[PREF_TILE_W64:.*]] = arith.constant 0 : i64
- //CHECK: %[[PREF_TILE_W:.*]] = arith.trunci %[[PREF_TILE_W64]] : i64 to i32
- //CHECK: %[[PREF_TILE_H64:.*]] = arith.constant 0 : i64
- //CHECK: %[[PREF_TILE_H:.*]] = arith.trunci %[[PREF_TILE_H64]] : i64 to i32
- //CHECK: %[[PREF_LLVMPTR:.*]] = llvm.inttoptr %[[PREF_INTPTR]] : i64 to !llvm.ptr<1>
- //CHECK: %[[PREF_SIZEOF_F32:.*]] = arith.constant 4 : i32
- //CHECK: %[[PREF_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[PREF_BASE_W]], %[[PREF_SIZEOF_F32]] : i32
- //CHECK: xevm.blockprefetch2d %[[PREF_LLVMPTR]], %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_BASE_H]],
- //CHECK-SAME: %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_TILE_W]], %[[PREF_TILE_H]]
- //CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
- //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32, v_blocks = 1 : i32}>
- //CHECK-SAME: : (!llvm.ptr<1>, i32, i32, i32, i32, i32)
+ #xegpu.block_tdesc_attr<memory_space = global>>
+ // CHECK: %[[C16_I64:.*]] = arith.constant 16 : i64
+ // CHECK: %[[VAR1:.*]] = arith.trunci %[[C16_I64]] : i64 to i32
+ // CHECK: %[[C8_I64:.*]] = arith.constant 8 : i64
+ // CHECK: %[[VAR2:.*]] = arith.trunci %[[C8_I64]] : i64 to i32
+ // CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
+ // CHECK: %[[VAR3:.*]] = arith.trunci %[[C0_I64]] : i64 to i32
+ // CHECK: %[[C0_I64_1:.*]] = arith.constant 0 : i64
+ // CHECK: %[[VAR4:.*]] = arith.trunci %[[C0_I64_1]] : i64 to i32
+ // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+ // CHECK: %[[VAR6:.*]] = arith.muli %[[VAR1]], %[[C4_I32]] : i32
+ // CHECK: xevm.blockprefetch2d %[[VAR5]], %[[VAR6]], %[[VAR2]], %[[VAR6]], %[[VAR3]], %[[VAR4]]
+ // CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
+ // CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32, v_blocks = 1 : i32}>
xegpu.prefetch_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>,
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
gpu.return
}
>From 9a2ea5f5b45586bff911509739aa0cef66e61933 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 17:55:53 +0000
Subject: [PATCH 06/10] Update test checks.
---
.../XeGPUToXeVM/create_nd_tdesc.mlir | 48 ++++++-------------
1 file changed, 14 insertions(+), 34 deletions(-)
diff --git a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
index d6e36fa73bf04..38d2c6483c204 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir
@@ -4,45 +4,25 @@ gpu.module @create_nd_tdesc {
// CHECK-LABEL: gpu.func @create_nd_tdesc
// CHECK-SAME: %[[ARG0:.*]]: memref<16x32xf32, 1>, %[[ARG1:.*]]: ui64,
// CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index, %[[ARG7:.*]]: index
+ // CHECK-SAME: %[[ARG8:.*]]: memref<?x?xf16>) kernel {
gpu.func @create_nd_tdesc(%src: memref<16x32xf32, 1>, %ptr: ui64, %shape1: index, %shape2: index,
- %stride1: index, %stride2: index, %offset1: index, %offset2: index) kernel {
- // CHECK: %[[VAR0:.*]] = index.castu %[[ARG1]] : ui64 to index
- // CHECK: %[[BASE_ADDR:.*]] = arith.index_castui %[[VAR0]] : index to i64
- // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32>
- // CHECK: %[[OFFSET_W:.*]] = arith.constant 0 : i32
- // CHECK: %[[OFFSET_H:.*]] = arith.constant 0 : i32
- // CHECK: %[[SHAPE_W:.*]] = arith.index_cast %[[ARG3]] : index to i32
- // CHECK: %[[SHAPE_H:.*]] = arith.index_cast %[[ARG2]] : index to i32
- // CHECK: %[[VAR6:.*]] = vector.bitcast %[[CST]] : vector<8xi32> to vector<4xi64>
- // CHECK: %[[VAR7:.*]] = vector.insert %[[BASE_ADDR]], %[[VAR6]] [0] : i64 into vector<4xi64>
- // CHECK: %[[VAR8:.*]] = vector.bitcast %[[VAR7]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[VAR9:.*]] = vector.insert %[[SHAPE_W]], %[[VAR8]] [2] : i32 into vector<8xi32>
- // CHECK: %[[VAR10:.*]] = vector.insert %[[SHAPE_H]], %[[VAR9]] [3] : i32 into vector<8xi32>
- // CHECK: %[[VAR11:.*]] = vector.insert %[[OFFSET_W]], %[[VAR10]] [4] : i32 into vector<8xi32>
- // CHECK: %[[VAR12:.*]] = vector.insert %[[OFFSET_H]], %[[VAR11]] [5] : i32 into vector<8xi32>
+ %stride1: index, %stride2: index, %offset1: index, %offset2: index, %dyn: memref<?x?xf16>) kernel {
+ // Optimized away
%ptr_tdesc = xegpu.create_nd_tdesc %ptr, shape:[%shape1, %shape2], strides:[%stride1, %stride2]
: ui64 -> !xegpu.tensor_desc<8x16xf32>
-
- // CHECK: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<16x32xf32, 1> to memref<16x32xf32>
+ // CHECK-NEXT: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<16x32xf32, 1> to memref<16x32xf32>
%srcce = memref.memory_space_cast %src : memref<16x32xf32, 1> to memref<16x32xf32>
-
- // CHECK: %[[CST_1:.*]] = arith.constant dense<0> : vector<8xi32>
- // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST]] : memref<16x32xf32> -> index
- // CHECK: %[[OFFSET_W2:.*]] = arith.constant 0 : i32
- // CHECK: %[[OFFSET_H2:.*]] = arith.constant 0 : i32
- // CHECK: %[[C32_I64:.*]] = arith.constant 32 : i64
- // CHECK: %[[SHAPE_W2:.*]] = arith.trunci %[[C32_I64]] : i64 to i32
- // CHECK: %[[C16_I64:.*]] = arith.constant 16 : i64
- // CHECK: %[[SHAPE_H2:.*]] = arith.trunci %[[C16_I64]] : i64 to i32
- // CHECK: %[[BASE_ADDR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
- // CHECK: %[[VAR14:.*]] = vector.bitcast %[[CST_1]] : vector<8xi32> to vector<4xi64>
- // CHECK: %[[VAR15:.*]] = vector.insert %[[BASE_ADDR2]], %[[VAR14]] [0] : i64 into vector<4xi64>
- // CHECK: %[[VAR16:.*]] = vector.bitcast %[[VAR15]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[VAR17:.*]] = vector.insert %[[SHAPE_W2]], %[[VAR16]] [2] : i32 into vector<8xi32>
- // CHECK: %[[VAR18:.*]] = vector.insert %[[SHAPE_H2]], %[[VAR17]] [3] : i32 into vector<8xi32>
- // CHECK: %[[VAR19:.*]] = vector.insert %[[OFFSET_W2]], %[[VAR18]] [4] : i32 into vector<8xi32>
- // CHECK: %[[PAYLOAD:.*]] = vector.insert %[[OFFSET_H2]], %[[VAR19]] [5] : i32 into vector<8xi32>
+ // Optimized away
%src_tdesc = xegpu.create_nd_tdesc %srcce : memref<16x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK-NEXT: %c1 = arith.constant 1 : index
+ %c1 = arith.constant 1 : index
+ // CHECK-NEXT: %c64 = arith.constant 64 : index
+ %size_x = arith.constant 64 : index
+ // CHECK-NEXT: %c16 = arith.constant 16 : index
+ %BLOCK_DMODEL = arith.constant 16 : index
+ // Optimized away
+ %dyn_tdesc = xegpu.create_nd_tdesc %dyn, shape: [%size_x, %BLOCK_DMODEL], strides: [%BLOCK_DMODEL, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<16x16xf16>
+ // CHECK-NEXT: gpu.return
gpu.return
}
}
>From a79bd40420abb3956a49cff69e44484d5fbc7887 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 18:23:57 +0000
Subject: [PATCH 07/10] Update test checks.
---
.../loadstore_nd_high_base_rank.mlir | 24 ++++++-
.../XeGPUToXeVM/loadstore_nd_int_addr.mlir | 72 ++++++-------------
2 files changed, 45 insertions(+), 51 deletions(-)
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
index e328517634b03..d80f12c06a58a 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir
@@ -1,23 +1,45 @@
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
gpu.module @load_store_check {
- // CHECK: fail
+ // CHECK-LABEL: gpu.func @load_store
+ // CHECK-SAME: %[[ARG0:.*]]: memref<3x3x8x16xf32, 1>, %[[ARG1:.*]]: memref<3x3x8x16xf32, 1>) kernel {
gpu.func @load_store(%src: memref<3x3x8x16xf32, 1>, %dst: memref<3x3x8x16xf32, 1>) kernel {
+ // CHECK: %[[C32_I32:.*]] = arith.constant 32 : i32
+ // CHECK: %[[C64_I32:.*]] = arith.constant 64 : i32
+ // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
+ // CHECK: %[[C72_I32:.*]] = arith.constant 72 : i32
+ // CHECK: %[[MEMSPACECAST:.*]] = memref.memory_space_cast %[[ARG0]] : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
+ // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST]] : memref<3x3x8x16xf32> -> index
+ // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
%srcce = memref.memory_space_cast %src : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
+ // CHECK: %[[MEMSPACECAST_0:.*]] = memref.memory_space_cast %[[ARG1]] : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
+ // CHECK: %[[INTPTR_1:.*]] = memref.extract_aligned_pointer_as_index %[[MEMSPACECAST_0]] : memref<3x3x8x16xf32> -> index
+ // CHECK: %[[VAR1:.*]] = arith.index_castui %[[INTPTR_1]] : index to i64
%dstte = memref.memory_space_cast %dst : memref<3x3x8x16xf32, 1> to memref<3x3x8x16xf32>
%src_tdesc = xegpu.create_nd_tdesc %srcce : memref<3x3x8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ // CHECK: %[[VAR2:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[LOADED:.*]] = xevm.blockload2d %[[VAR2]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]],
+ // CHECK-SAME: %[[C0_I32]], %[[C64_I32]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32,
+ // CHECK-SAME: tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
%loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
%tid_x = gpu.thread_id x
%tid_x_i32 = arith.index_cast %tid_x : index to i32
%tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
+ // CHECK: %[[VAR7:.*]] = vector.insert
%loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
%dst_tdesc = xegpu.create_nd_tdesc %dstte : memref<3x3x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+ // CHECK: %[[VAR8:.*]] = llvm.inttoptr %[[VAR1]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[VAR9:.*]] = vector.bitcast %[[VAR7]] : vector<8xf32> to vector<8xi32>
+ // CHECK: xevm.blockstore2d %[[VAR8]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]], %[[C0_I32]], %[[C32_I32]], %[[VAR9]]
+ // CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
: vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
gpu.return
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
index ca458eaf231c0..c8ce0b3021b3f 100644
--- a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_int_addr.mlir
@@ -1,73 +1,45 @@
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
gpu.module @load_store_check {
+ // CHECK-LABEL: gpu.func @load_store
+ // CHECK-SAME: %[[ARG0:.*]]: ui64, %[[ARG1:.*]]: ui32) kernel {
gpu.func @load_store(%src: ui64, %dst: ui32) kernel {
- // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
- // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
- // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
- // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
- // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
+ // CHECK: %[[C64_I32:.*]] = arith.constant 64 : i32
+ // CHECK: %[[C0_I32:.*]] = arith.constant 0
+ // CHECK: %[[C8_I32:.*]] = arith.constant 8 : i32
+ // CHECK: %[[ARG1_IDX:.*]] = index.castu %[[ARG1]] : ui32 to index
+ // CHECK: %[[ARG1_I32:.*]] = arith.index_castui %[[ARG1_IDX]] : index to i32
+ // CHECK: %[[ARG0_IDX:.*]] = index.castu %[[ARG0]] : ui64 to index
+ // CHECK: %[[ARG0_I64:.*]] = arith.index_castui %[[ARG0_IDX]] : index to i64
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%src_tdesc = xegpu.create_nd_tdesc %src, shape:[%c8, %c16], strides:[%c16, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
- //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
- //CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
- //CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
- //CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
- //CHECK: %[[LD_TILE_W64:.*]] = arith.constant 0 : i64
- //CHECK: %[[LD_TILE_W:.*]] = arith.trunci %[[LD_TILE_W64]] : i64 to i32
- //CHECK: %[[LD_TILE_H64:.*]] = arith.constant 0 : i64
- //CHECK: %[[LD_TILE_H:.*]] = arith.trunci %[[LD_TILE_H64]] : i64 to i32
- //CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1>
- //CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32
- //CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32
- //CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]],
- //CHECK-SAME: %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]]
- //CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
- //CHECK-SAME: pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false,
- //CHECK-SAME: v_blocks = 1 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+ // CHECK: %[[VAR4:.*]] = llvm.inttoptr %[[ARG0_I64]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[LOAD:.*]] = xevm.blockload2d %[[VAR4]], %[[C64_I32]], %[[C8_I32]], %[[C64_I32]],
+ // CHECK-SAME: %[[C0_I32]], %[[C0_I32]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32,
+ // CHECK-SAME: tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
+ // CHECK: %[[VAR6:.*]] = vector.bitcast %[[LOAD]] : vector<8xi32> to vector<8xf32>
%loaded = xegpu.load_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
: !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
- //CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32>
%tid_x = gpu.thread_id x
%tid_x_i32 = arith.index_cast %tid_x : index to i32
%tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
- //CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32>
+ // CHECK: %[[VAR9:.*]] = vector.insert
%loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
- // CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
- // CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
- // CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
- // CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32>
- // CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32>
- // CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32>
- // CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32>
- // CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32>
+ // CHECK: %[[VAR10:.*]] = arith.extui %[[ARG1_I32]] : i32 to i64
%dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%c8, %c16], strides:[%c16, %c1] : ui32 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
- //CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64>
- //CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64>
- //CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32>
- //CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32>
- //CHECK: %[[TILE_W64:.*]] = arith.constant 0 : i64
- //CHECK: %[[TILE_W:.*]] = arith.trunci %[[TILE_W64]] : i64 to i32
- //CHECK: %[[TILE_H64:.*]] = arith.constant 0 : i64
- //CHECK: %[[TILE_H:.*]] = arith.trunci %[[TILE_H64]] : i64 to i32
- //CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1>
- //CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32
- //CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32
- //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32>
- //CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]],
- //CHECK-SAME: %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]]
- //CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
- //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+ // CHECK: %[[VAR11:.*]] = llvm.inttoptr %[[VAR10]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[STORE:.*]] = vector.bitcast %[[VAR9]] : vector<8xf32> to vector<8xi32>
+ // CHECK: xevm.blockstore2d %[[VAR11]], %[[C64_I32]], %[[C8_I32]], %[[C64_I32]], %[[C0_I32]], %[[C0_I32]], %[[STORE]]
+ // CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
xegpu.store_nd %loaded_modified, %dst_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
: vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
gpu.return
>From e053be1947ed9a35216b00afba3e1613066f6eb3 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 18:58:10 +0000
Subject: [PATCH 08/10] Fix dynamic stride computation issue and add test case.
---
mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index d7db65d662faf..38303e33e884c 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -143,7 +143,6 @@ translateStoreXeGPUCacheHint(std::optional<xegpu::CachePolicy> L1hint,
}
// Compute the product of sizes in the range [lo, hi) from the sizes array.
-// Note: all sizes are i64.
static Value getProductOfSizes(ConversionPatternRewriter &rewriter,
Location loc, ArrayRef<OpFoldResult> sizes,
size_t lo, size_t hi) {
@@ -152,6 +151,8 @@ static Value getProductOfSizes(ConversionPatternRewriter &rewriter,
for (size_t idx = lo; idx < hi; idx++) {
OpFoldResult ofr = sizes[idx];
Value sizeVal = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
+ sizeVal = getValueOrCreateCastToIndexLike(rewriter, loc,
+ rewriter.getI64Type(), sizeVal);
product = rewriter.createOrFold<arith::MulIOp>(loc, product, sizeVal);
}
return product;
>From 88ab9aaf6a60765f09508c73275bc093d1adc956 Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Wed, 22 Oct 2025 19:17:07 +0000
Subject: [PATCH 09/10] Add more high rank base memory test cases.
---
.../loadstore_nd_high_base_rank_dynamic.mlir | 54 +++++++++++++++++++
.../loadstore_nd_high_base_rank_int_addr.mlir | 52 ++++++++++++++++++
2 files changed, 106 insertions(+)
create mode 100644 mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_dynamic.mlir
create mode 100644 mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_int_addr.mlir
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_dynamic.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_dynamic.mlir
new file mode 100644
index 0000000000000..16ecd978ad307
--- /dev/null
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_dynamic.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
+
+gpu.module @load_store_check {
+ // CHECK-LABEL: gpu.func @load_store
+ // CHECK-SAME: %[[ARG0:.*]]: memref<?x?x?x?xf32>, %[[ARG1:.*]]: memref<?x?x?x?xf32>) kernel {
+ gpu.func @load_store(%src: memref<?x?x?x?xf32>, %dst: memref<?x?x?x?xf32>) kernel {
+ // CHECK: %[[C32_I32:.*]] = arith.constant 32 : i32
+ // CHECK: %[[C64_I32:.*]] = arith.constant 64 : i32
+ // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
+ // CHECK: %[[C72_I32:.*]] = arith.constant 72 : i32
+ // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<?x?x?x?xf32> -> index
+ // CHECK: %[[VAR0:.*]] = arith.index_castui %[[INTPTR]] : index to i64
+ // CHECK: %[[INTPTR_0:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<?x?x?x?xf32> -> index
+ // CHECK: %[[VAR1:.*]] = arith.index_castui %[[INTPTR_0]] : index to i64
+ %dim0 = arith.constant 3 : index
+ %dim1 = arith.constant 3 : index
+ %dim2 = arith.constant 8 : index
+ %dim3 = arith.constant 16 : index
+ %stride3 = arith.constant 1 : index
+ %stride2 = arith.constant 16 : index
+ %stride1 = arith.constant 128 : index
+ %stride0 = arith.constant 384 : index
+
+ %src_tdesc = xegpu.create_nd_tdesc %src, shape:[%dim0, %dim1, %dim2, %dim3],
+ strides:[%stride0, %stride1, %stride2, %stride3] : memref<?x?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: %[[VAR2:.*]] = llvm.inttoptr %[[VAR1]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[LOADED:.*]] = xevm.blockload2d %[[VAR2]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]],
+ // CHECK-SAME: %[[C0_I32]], %[[C64_I32]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32,
+ // CHECK-SAME: tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
+ // CHECK: %[[LOADED_F32:.*]] = vector.bitcast %[[LOADED]] : vector<8xi32> to vector<8xf32>
+ %loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+
+ %tid_x = gpu.thread_id x
+ %tid_x_i32 = arith.index_cast %tid_x : index to i32
+ %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
+ // CHECK: %[[LOADED_MODIFIED:.*]] = vector.insert
+ %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
+
+ %dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%dim0, %dim1, %dim2, %dim3],
+ strides:[%stride0, %stride1, %stride2, %stride3] : memref<?x?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+
+ // CHECK: %[[VAR8:.*]] = llvm.inttoptr %[[VAR0]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[LOADED_MODIFIED_BC:.*]] = vector.bitcast %[[LOADED_MODIFIED]] : vector<8xf32> to vector<8xi32>
+ // CHECK: xevm.blockstore2d %[[VAR8]], %[[C64_I32]], %[[C72_I32]], %[[C64_I32]],
+ // CHECK-SAME: %[[C0_I32]], %[[C32_I32]], %[[LOADED_MODIFIED_BC]] <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>,
+ // CHECK-SAME: elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
+ xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+ }
+}
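For reference, the folded constants in the CHECK lines above work out as follows: the collapsed base height is dim0 * dim1 * dim2 = 3 * 3 * 8 = 72 rows (%[[C72_I32]]), and the base width / row pitch is dim3 * sizeof(f32) = 16 * 4 = 64 bytes. The load at offsets [2, 2, 0, 0] collapses to row 2*(3*8) + 2*8 = 64, which is why %[[C64_I32]] appears in the blockload2d both as the pitch in bytes and as the Y offset, while the store at [1, 1, 0, 0] collapses to row 1*(3*8) + 1*8 = 32 (%[[C32_I32]]). The X offset stays 0 in both cases.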
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_int_addr.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_int_addr.mlir
new file mode 100644
index 0000000000000..428534c628314
--- /dev/null
+++ b/mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank_int_addr.mlir
@@ -0,0 +1,52 @@
+// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize -cse %s | FileCheck %s
+
+gpu.module @load_store_check {
+ // CHECK-LABEL: gpu.func @load_store
+ // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64,
+ // CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index, %[[ARG7:.*]]: index, %[[ARG8:.*]]: index, %[[ARG9:.*]]: index
+ gpu.func @load_store(%src: i64, %dst: i64, %dim0: index, %dim1: index, %dim2: index, %dim3: index,
+ %stride0: index, %stride1: index, %stride2: index, %stride3: index) kernel {
+ // CHECK: %[[C2:.*]] = arith.constant 2 : index
+ // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
+ // CHECK: %[[C4_I32:.*]] = arith.constant 4 : i32
+ // CHECK: %[[VAR0:.*]] = arith.index_cast %[[ARG5]] : index to i32
+ // CHECK: %[[VAR1:.*]] = arith.index_cast %[[ARG2]] : index to i64
+ // CHECK: %[[VAR2:.*]] = arith.index_cast %[[ARG3]] : index to i64
+ // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[VAR2]] : i64
+ // CHECK: %[[VAR4:.*]] = arith.index_cast %[[ARG4]] : index to i64
+ // CHECK: %[[VAR5:.*]] = arith.muli %[[VAR3]], %[[VAR4]] : i64
+ // CHECK: %[[VAR6:.*]] = arith.trunci %[[VAR5]] : i64 to i32
+ // CHECK: %[[VAR7:.*]] = arith.muli %[[ARG4]], %[[C2]] : index
+ // CHECK: %[[VAR8:.*]] = arith.muli %[[ARG4]], %[[ARG3]] : index
+ // CHECK: %[[VAR9:.*]] = arith.muli %[[VAR8]], %[[C2]] : index
+ // CHECK: %[[VAR10:.*]] = arith.addi %[[VAR7]], %[[VAR9]] : index
+ // CHECK: %[[VAR11:.*]] = arith.index_cast %[[VAR10]] : index to i32
+ %src_tdesc = xegpu.create_nd_tdesc %src, shape:[%dim0, %dim1, %dim2, %dim3],
+ strides:[%stride0, %stride1, %stride2, %stride3] : i64 -> !xegpu.tensor_desc<8x16xf32>
+
+ // CHECK: %[[SRC_PTR:.*]] = llvm.inttoptr %[[ARG0]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[VAR13:.*]] = arith.muli %[[VAR0]], %[[C4_I32]] : i32
+ // CHECK: %[[LOADED:.*]] = xevm.blockload2d %[[SRC_PTR]], %[[VAR13]], %[[VAR6]], %[[VAR13]], %[[C0_I32]], %[[VAR11]] <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32, pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false, v_blocks = 1 : i32}>
+ // CHECK: %[[VAR15:.*]] = vector.bitcast %[[LOADED]] : vector<8xi32> to vector<8xf32>
+ %loaded = xegpu.load_nd %src_tdesc[2, 2, 0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+
+ %tid_x = gpu.thread_id x
+ %tid_x_i32 = arith.index_cast %tid_x : index to i32
+ %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
+ // CHECK: %[[LOADED_MODIFIED:.*]] = vector.insert
+ %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
+
+ // CHECK: %[[VAR19:.*]] = arith.addi %[[ARG4]], %[[VAR8]] : index
+ // CHECK: %[[VAR20:.*]] = arith.index_cast %[[VAR19]] : index to i32
+ %dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%dim0, %dim1, %dim2, %dim3],
+ strides:[%stride0, %stride1, %stride2, %stride3] : i64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+
+ // CHECK: %[[DST_PTR:.*]] = llvm.inttoptr %[[ARG1]] : i64 to !llvm.ptr<1>
+ // CHECK: %[[LOADED_MODIFIED_BITCAST:.*]] = vector.bitcast %[[LOADED_MODIFIED]] : vector<8xf32> to vector<8xi32>
+ // CHECK: xevm.blockstore2d %[[DST_PTR]], %[[VAR13]], %[[VAR6]], %[[VAR13]], %[[C0_I32]], %[[VAR20]], %[[LOADED_MODIFIED_BITCAST]] <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32, tile_height = 8 : i32, tile_width = 16 : i32}>
+ xegpu.store_nd %loaded_modified, %dst_tdesc[1, 1, 0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
+ gpu.return
+ }
+}
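The same arithmetic applies in the integer-address test above, except nothing folds because shape and strides are function arguments: %[[VAR1]] through %[[VAR6]] build the collapsed base height dim0 * dim1 * dim2 in i64 and truncate it to i32, the row pitch in bytes is dim3 * 4 (%[[VAR0]] times %[[C4_I32]], giving %[[VAR13]]), %[[VAR7]] through %[[VAR11]] compute the load row offset 2*dim2 + 2*(dim1*dim2) for offsets [2, 2, 0, 0], and %[[VAR19]]/%[[VAR20]] compute dim2 + dim1*dim2 for the store at [1, 1, 0, 0].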
>From de7c9644d9c7c587861225966012f8880e6fbd8e Mon Sep 17 00:00:00 2001
From: "Lee, Sang Ik" <sang.ik.lee at intel.com>
Date: Fri, 24 Oct 2025 16:44:37 +0000
Subject: [PATCH 10/10] Tensor descriptor may not be directly produced by
create_nd_tdesc op, e.g. when it is passed as a block arg or func arg.
---
mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 38303e33e884c..cee051ab4dd7d 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -193,6 +193,9 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
auto tdVal = op.getTensorDesc();
xegpu::CreateNdDescOp descOp =
tdVal.template getDefiningOp<xegpu::CreateNdDescOp>();
+ if (!descOp)
+ return rewriter.notifyMatchFailure(
+ op, "Expected tensor descriptor to be created by CreateNdDescOp.");
auto mixedStrides = descOp.getMixedStrides();
auto mixedOffsets = op.getMixedOffsets();
auto mixedSizes = descOp.getMixedSizes();
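A minimal illustration of the case this check guards against (hypothetical IR, not part of the patch): when the descriptor reaches the load as a function or block argument there is no defining create_nd_tdesc to recover the sizes and strides from, so the pattern now reports a match failure instead of dereferencing a null op.

    gpu.module @example {
      gpu.func @load_from_arg(%tdesc: !xegpu.tensor_desc<8x16xf32>) kernel {
        // %tdesc has no defining op inside this function, so
        // getDefiningOp<CreateNdDescOp>() returns null here.
        %v = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
        gpu.return
      }
    }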