[Mlir-commits] [mlir] [mlir][xegpu] Add support for `vector.transfer_read/write` on SLM buffers (PR #192757)
Charitha Saumya
llvmlistbot at llvm.org
Fri Apr 17 17:15:21 PDT 2026
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/192757
>From 1b937b0d57cc133b4a550e219f7d792d5081d83b Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:04:31 +0000
Subject: [PATCH 1/4] save work
---
.../mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 5 +
.../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 2 +-
.../VectorToXeGPU/VectorToXeGPU.cpp | 98 ++++++++++++++++---
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 17 ++++
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 ---
.../VectorToXeGPU/transfer-read-to-xegpu.mlir | 47 ++++++++-
.../transfer-write-to-xegpu.mlir | 20 ++++
7 files changed, 168 insertions(+), 33 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index c173b93face98..dc6e972e3b5a6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -42,6 +42,11 @@ def XeGPU_Dialect : Dialect {
/// and data factors provided by the LayoutAttr.
static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::DistributeLayoutAttr attr);
+ /// Checks if the given memref type represents shared local memory (SLM).
+ /// Returns true if the memory space is address space 3, MemorySpace::SLM,
+ /// xevm::AddrSpace::SHARED, or a GPU workgroup memory address space.
+ static bool isSharedMemory(const MemRefType &memrefTy);
+
/// drops/slices the shape in the specified dims, and return the rest. e.g.,
/// for shape = [32, 64, 8], dims = [0, 2], it will return [64]
template<typename T, typename U>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f41c0bf1fd2b2..8256d8024d5ad 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -41,7 +41,7 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
let mnemonic = typeMnemonic;
}
-def isSharedPred : CPred<"isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
+def isSharedPred : CPred<"XeGPUDialect::isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
class StaticShared1DMemRefOf<list<Type> allowedTypes> :
ConfinedType<MemRefRankOf<allowedTypes, [1]>, [HasStaticShapePred, isSharedPred],
"reside in share memory and statically 1d shaped " # MemRefOf<allowedTypes>.summary # " ",
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index bbb6340f14c51..5769489aa2d43 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -547,31 +547,68 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
if (failed(transferPreconditions(rewriter, readOp)))
return failure();
+ auto readMemTy = cast<MemRefType>(readOp.getShapedType());
+ VectorType loadedVecTy = readOp.getVectorType();
+ bool isOutOfBounds = readOp.hasOutOfBoundsDim();
+ // Check whether the memref resides in shared local memory (SLM).
+ bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(readMemTy);
// TODO:This check needs to be replaced with proper uArch capability check
auto chip = xegpu::getChipStr(readOp);
- if (chip != "pvc" && chip != "bmg") {
- // lower to scattered load Op if the target HW doesn't have 2d block load
- // support
+ // Lower to scattered load Op if the target HW doesn't have 2d block load
+ // support and the load is not from shared memory.
+ if (chip != "pvc" && chip != "bmg" && !isSharedMemory) {
+
// TODO: add support for OutOfBound access
if (readOp.hasOutOfBoundsDim())
return failure();
return lowerToScatteredLoadOp(readOp, rewriter);
}
- VectorType loadedVecTy = readOp.getVectorType();
-
- // Lower using load.gather in 1D case
- if (loadedVecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+ // Handle the 1D non-SLM case using load.gather.
+ if (loadedVecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim() &&
+ !isSharedMemory)
return lowerToScatteredLoadOp(readOp, rewriter);
// Perform common data transfer checks.
- auto readMemTy = cast<MemRefType>(readOp.getShapedType());
+ // TODO: These checks may be too strict for the SLM case.
if (failed(
storeLoadPreconditions(rewriter, readOp, loadedVecTy, readMemTy)))
return failure();
- bool isOutOfBounds = readOp.hasOutOfBoundsDim();
+ // Handle the SLM case.
+ if (isSharedMemory) {
+ // For SLM memrefs, only the 2D case is supported for now.
+ if (loadedVecTy.getRank() != 2)
+ return rewriter.notifyMatchFailure(
+ readOp, "Only 2D vector loads are supported for SLM");
+ AffineMap readMap = readOp.getPermutationMap();
+ if (!readMap.isMinorIdentity())
+ return rewriter.notifyMatchFailure(
+ readOp, "Transpose not supported for SLM loads");
+ // Out of bounds case is not supported for SLM loads.
+ if (isOutOfBounds)
+ return rewriter.notifyMatchFailure(
+ readOp, "Out-of-bounds access is not supported for SLM loads");
+
+ // Create mem_desc for SLM
+ auto memDescType =
+ xegpu::MemDescType::get(rewriter.getContext(), readMemTy.getShape(),
+ readMemTy.getElementType(),
+ /*mem_layout=*/nullptr);
+ auto createMemDescOp = xegpu::CreateMemDescOp::create(
+ rewriter, loc, memDescType, readOp.getBase());
+ // Convert indices to OpFoldResult for LoadMatrixOp
+ SmallVector<OpFoldResult> indices =
+ getAsOpFoldResult(readOp.getIndices());
+ auto loadMatrixOp = xegpu::LoadMatrixOp::create(
+ rewriter, loc, loadedVecTy, createMemDescOp.getResult(), indices,
+ /*layout=*/nullptr);
+
+ rewriter.replaceOp(readOp, loadMatrixOp.getResult());
+ return success();
+ }
+
if (isOutOfBounds && !isZeroConstant(readOp.getPadding()))
return rewriter.notifyMatchFailure(
readOp, "Unsupported non-zero padded out-of-bounds read");
@@ -631,21 +668,24 @@ struct TransferWriteLowering
if (failed(transferPreconditions(rewriter, writeOp)))
return failure();
+ // Perform common data transfer checks.
+ VectorType vecTy = writeOp.getVectorType();
+ auto writeMemTy = cast<MemRefType>(writeOp.getShapedType());
+ // Check whether the memref resides in shared local memory (SLM).
+ bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(writeMemTy);
// TODO:This check needs to be replaced with proper uArch capability check
auto chip = xegpu::getChipStr(writeOp);
- if (chip != "pvc" && chip != "bmg") {
- // lower to scattered store Op if the target HW doesn't have 2d block
- // store support
+ // Lower to scattered store Op if the target HW doesn't have 2d block
+ // store support and the memref is not SLM.
+ if (chip != "pvc" && chip != "bmg" && !isSharedMemory) {
+
// TODO: add support for OutOfBound access
if (writeOp.hasOutOfBoundsDim())
return failure();
return lowerToScatteredStoreOp(writeOp, rewriter);
}
- // Perform common data transfer checks.
- VectorType vecTy = writeOp.getVectorType();
- auto writeMemTy = cast<MemRefType>(writeOp.getShapedType());
if (failed(storeLoadPreconditions(rewriter, writeOp, vecTy, writeMemTy)))
return failure();
@@ -653,6 +693,34 @@ struct TransferWriteLowering
if (!map.isMinorIdentity())
return rewriter.notifyMatchFailure(writeOp, "Expects identity map");
+ // For shared local memory (SLM), lower to create_mem_desc followed by
+ // store_matrix.
+ if (isSharedMemory) {
+ // Only support 2D case for now.
+ if (vecTy.getRank() != 2)
+ return rewriter.notifyMatchFailure(
+ writeOp, "Only 2D vector stores are supported for SLM");
+ // Create mem_desc for SLM
+ auto memDescType =
+ xegpu::MemDescType::get(rewriter.getContext(), writeMemTy.getShape(),
+ writeMemTy.getElementType(),
+ /*mem_layout=*/nullptr);
+
+ auto createMemDescOp = xegpu::CreateMemDescOp::create(
+ rewriter, loc, memDescType, writeOp.getBase());
+
+ // Convert indices to OpFoldResult for StoreMatrixOp
+ SmallVector<OpFoldResult> indices =
+ getAsOpFoldResult(writeOp.getIndices());
+
+ xegpu::StoreMatrixOp::create(rewriter, loc, writeOp.getVector(),
+ createMemDescOp.getResult(), indices,
+ /*layout=*/nullptr);
+
+ rewriter.eraseOp(writeOp);
+ return success();
+ }
+
auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
rewriter, loc, writeOp.getBase(),
getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 950371e17255f..d1f4b712b9246 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,6 +8,8 @@
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
@@ -15,6 +17,7 @@
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
using std::optional;
@@ -121,6 +124,20 @@ static SmallVector<SmallVector<int64_t>> genStaticCoordinates(
return coordinates;
}
+// Returns true when the memory space of `memrefTy` denotes shared local memory (SLM).
+bool XeGPUDialect::isSharedMemory(const MemRefType &memrefTy) {
+ Attribute attr = memrefTy.getMemorySpace();
+ if (!attr)
+ return false; // A null memory space attribute is treated as non-SLM.
+ if (auto intAttr = llvm::dyn_cast_if_present<IntegerAttr>(attr))
+ return intAttr.getInt() == 3;
+ if (auto memrefSpace = llvm::dyn_cast_if_present<MemorySpaceAttr>(attr))
+ return memrefSpace.getValue() == MemorySpace::SLM;
+ if (auto xevmSpace = llvm::dyn_cast_if_present<xevm::AddrSpaceAttr>(attr))
+ return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
+ return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
+}
+
// Checks if the given shape can be evenly distributed based on the layout
// and data factors provided by the LayoutAttr.
bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5697097a4c999..6955cd1e26a53 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,7 +8,6 @@
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -23,17 +22,6 @@
using namespace mlir;
using namespace mlir::xegpu;
-static bool isSharedMemory(const MemRefType &memrefTy) {
- Attribute attr = memrefTy.getMemorySpace();
- if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr))
- return intAttr.getInt() == 3;
- if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
- return memrefSpace.getValue() == MemorySpace::SLM;
- if (auto xevmSpace = llvm::dyn_cast<xevm::AddrSpaceAttr>(attr))
- return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
- return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
-}
-
template <typename T>
static std::string makeString(T array, bool breakline = false) {
std::string buf;
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index 1a19c8a13f120..7f9fe1be67438 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -116,8 +116,8 @@ gpu.func @load_transposed(%source: memref<32x64xf32>,
// LOAD-ND-LABEL: @load_transposed(
// LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>,
-// LOAD-ND-SAME: %[[OFFSET1:.+]]: index,
-// LOAD-ND-SAME: %[[OFFSET2:.+]]: index
+// LOAD-ND-SAME: %[[OFFSET1:.+]]: index,
+// LOAD-ND-SAME: %[[OFFSET2:.+]]: index
// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]]
// LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32
// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]]
@@ -221,7 +221,7 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>,
// LOAD-GATHER-DAG: %[[OFFSETS:.+]] = arith.addi %[[BCASTIDX]], {{.*}} : vector<8x16xindex>
// LOAD-GATHER-DAG: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %arg0 : memref<?x8x16xf32> -> index
// LOAD-GATHER-DAG: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
-// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[OFFSETS]]{{\]}}, %[[CST_0]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
+// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[OFFSETS]]{{\]}}, %[[CST_0]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
}
@@ -439,7 +439,7 @@ gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %of
// LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
// LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
// LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
-// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
// LOAD-ND: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex>
// LOAD-ND: arith.muli {{.*}} : index
@@ -455,7 +455,7 @@ gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %of
// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
// LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
// LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
-// LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
// LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
// LOAD-GATHER: %[[STEP:.+]] = vector.step : vector<8xindex>
// LOAD-GATHER: arith.muli {{.*}} : index
@@ -510,3 +510,40 @@ gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %of
// LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16>
}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_2D_vector_addrspace3(%source: memref<16x32xf32, 3>,
+ %offset: index) -> vector<8x16xf32> {
+ %c0 = arith.constant 0.0 : f32
+ %0 = vector.transfer_read %source[%offset, %offset], %c0
+ {in_bounds = [true, true]} : memref<16x32xf32, 3>, vector<8x16xf32>
+ gpu.return %0 : vector<8x16xf32>
+}
+
+// LOAD-ND-LABEL: @load_2D_vector_addrspace3
+// LOAD-ND-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// LOAD-ND-SAME: %[[OFFSET:.+]]: index
+// LOAD-ND: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// LOAD-ND: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
+// LOAD-ND: gpu.return %[[DATA]] : vector<8x16xf32>
+
+}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_1D_vector_addrspace3_unsupported(%source: memref<32xf32, 3>,
+ %offset: index) -> vector<8xf32> {
+ %c0 = arith.constant 0.0 : f32
+ %0 = vector.transfer_read %source[%offset], %c0
+ {in_bounds = [true]} : memref<32xf32, 3>, vector<8xf32>
+ gpu.return %0 : vector<8xf32>
+}
+
+// LOAD-ND-LABEL: @load_1D_vector_addrspace3_unsupported
+// LOAD-ND: vector.transfer_read
+
+// LOAD-GATHER-LABEL: @load_1D_vector_addrspace3_unsupported
+// LOAD-GATHER: vector.transfer_read
+
+}
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
index 66da64225678e..abf4b031937f3 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
@@ -344,3 +344,23 @@ gpu.func @store_to_subview(%vec: vector<8xf16>,
// STORE-SCATTER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
// STORE-SCATTER: xegpu.store %[[VEC]], %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : vector<8xf16>, i64, vector<8xindex>, vector<8xi1>
}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @store_2D_vector_addrspace3(%vec: vector<8x16xf32>,
+ %source: memref<16x32xf32, 3>, %offset: index) {
+ vector.transfer_write %vec, %source[%offset, %offset]
+ {in_bounds = [true, true]}
+ : vector<8x16xf32>, memref<16x32xf32, 3>
+ gpu.return
+}
+
+// STORE-ND-LABEL: @store_2D_vector_addrspace3
+// STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>
+// STORE-ND-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// STORE-ND-SAME: %[[OFFSET:.+]]: index
+// STORE-ND: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// STORE-ND: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
+// STORE-ND: gpu.return
+
+}
>From 0b3b945961d0093615deab8695f1b4464a23342d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:08:47 +0000
Subject: [PATCH 2/4] save work
---
.../Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index 7f9fe1be67438..f7bcbe17c542e 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -528,6 +528,13 @@ gpu.func @load_2D_vector_addrspace3(%source: memref<16x32xf32, 3>,
// LOAD-ND: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
// LOAD-ND: gpu.return %[[DATA]] : vector<8x16xf32>
+// LOAD-GATHER-LABEL: @load_2D_vector_addrspace3
+// LOAD-GATHER-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// LOAD-GATHER-SAME: %[[OFFSET:.+]]: index
+// LOAD-GATHER: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// LOAD-GATHER: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
+// LOAD-GATHER: gpu.return %[[DATA]] : vector<8x16xf32>
+
}
// -----
>From 8b11effdd59b7947f17c0e32ddde4e944661d6ae Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:09:09 +0000
Subject: [PATCH 3/4] save work
---
.../transfer-write-to-xegpu.mlir | 26 +++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
index abf4b031937f3..9dab33bc78e89 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
@@ -363,4 +363,30 @@ gpu.func @store_2D_vector_addrspace3(%vec: vector<8x16xf32>,
// STORE-ND: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
// STORE-ND: gpu.return
+// STORE-SCATTER-LABEL: @store_2D_vector_addrspace3
+// STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>
+// STORE-SCATTER-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// STORE-SCATTER-SAME: %[[OFFSET:.+]]: index
+// STORE-SCATTER: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// STORE-SCATTER: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
+// STORE-SCATTER: gpu.return
+
+}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @store_1D_vector_addrspace3_unsupported(%vec: vector<8xf32>,
+ %source: memref<32xf32, 3>, %offset: index) {
+ vector.transfer_write %vec, %source[%offset]
+ {in_bounds = [true]}
+ : vector<8xf32>, memref<32xf32, 3>
+ gpu.return
+}
+
+// STORE-ND-LABEL: @store_1D_vector_addrspace3_unsupported
+// STORE-ND: vector.transfer_write
+
+// STORE-SCATTER-LABEL: @store_1D_vector_addrspace3_unsupported
+// STORE-SCATTER: vector.transfer_write
+
}
>From 448b28ce0572e383268283454bde7f9de61e6988 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:15:02 +0000
Subject: [PATCH 4/4] save work
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 4 ----
1 file changed, 4 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index dc6e972e3b5a6..b1490c7742a26 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -38,10 +38,6 @@ def XeGPU_Dialect : Dialect {
let useDefaultAttributePrinterParser = true;
let extraClassDeclaration = [{
- /// Checks if the given shape can be evenly distributed based on the layout
- /// and data factors provided by the LayoutAttr.
- static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::DistributeLayoutAttr attr);
-
/// Checks if the given memref type represents shared local memory (SLM).
/// Returns true if the memory space is address space 3, MemorySpace::SLM,
/// xevm::AddrSpace::SHARED, or a GPU workgroup memory address space.
More information about the Mlir-commits
mailing list