[Mlir-commits] [mlir] [mlir][xegpu] Add support for `vector.transfer_read/write` on SLM buffers (PR #192757)

Fri Apr 17 17:15:21 PDT 2026

https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/192757

>From 1b937b0d57cc133b4a550e219f7d792d5081d83b Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:04:31 +0000
Subject: [PATCH 1/4] save work

---
 .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td     |  5 +
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  2 +-
 .../VectorToXeGPU/VectorToXeGPU.cpp           | 98 ++++++++++++++++---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 17 ++++
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 12 ---
 .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 47 ++++++++-
 .../transfer-write-to-xegpu.mlir              | 20 ++++
 7 files changed, 168 insertions(+), 33 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index c173b93face98..dc6e972e3b5a6 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -42,6 +42,11 @@ def XeGPU_Dialect : Dialect {
       /// and data factors provided by the LayoutAttr.
       static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::DistributeLayoutAttr attr);
 
+      /// Checks if the given memref type represents shared local memory (SLM).
+      /// Returns true if the memory space is address space 3, MemorySpace::SLM,
+      /// xevm::AddrSpace::SHARED, or a GPU workgroup memory address space.
+      static bool isSharedMemory(const MemRefType &memrefTy);
+
       /// drops/slices the shape in the specified dims, and return the rest. e.g.,
       /// for shape = [32, 64, 8], dims = [0, 2], it will return [64]
       template<typename T, typename U>
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index f41c0bf1fd2b2..8256d8024d5ad 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -41,7 +41,7 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
-def isSharedPred : CPred<"isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
+def isSharedPred : CPred<"XeGPUDialect::isSharedMemory(llvm::cast<mlir::MemRefType>($_self))">;
 class StaticShared1DMemRefOf<list<Type> allowedTypes> :
   ConfinedType<MemRefRankOf<allowedTypes, [1]>, [HasStaticShapePred, isSharedPred],
      "reside in share memory and statically 1d shaped " # MemRefOf<allowedTypes>.summary # " ",
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index bbb6340f14c51..5769489aa2d43 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -547,31 +547,68 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
 
     if (failed(transferPreconditions(rewriter, readOp)))
       return failure();
+    auto readMemTy = cast<MemRefType>(readOp.getShapedType());
+    VectorType loadedVecTy = readOp.getVectorType();
+    bool isOutOfBounds = readOp.hasOutOfBoundsDim();
+    // Check if the memref has address space 3 (shared local memory)
+    bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(readMemTy);
 
     // TODO:This check needs to be replaced with proper uArch capability check
     auto chip = xegpu::getChipStr(readOp);
-    if (chip != "pvc" && chip != "bmg") {
-      // lower to scattered load Op if the target HW doesn't have 2d block load
-      // support
+    // Lower to scattered load Op if the target HW doesn't have 2d block load
+    // support and the load is not from shared memory.
+    if (chip != "pvc" && chip != "bmg" && !isSharedMemory) {
+
       // TODO: add support for OutOfBound access
       if (readOp.hasOutOfBoundsDim())
         return failure();
       return lowerToScatteredLoadOp(readOp, rewriter);
     }
 
-    VectorType loadedVecTy = readOp.getVectorType();
-
-    // Lower using load.gather in 1D case
-    if (loadedVecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+    // Handle the 1D non-SLM case using load.gather.
+    if (loadedVecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim() &&
+        !isSharedMemory)
       return lowerToScatteredLoadOp(readOp, rewriter);
 
     // Perform common data transfer checks.
-    auto readMemTy = cast<MemRefType>(readOp.getShapedType());
+    // TODO: Maybe too strict for SLM case.
     if (failed(
             storeLoadPreconditions(rewriter, readOp, loadedVecTy, readMemTy)))
       return failure();
 
-    bool isOutOfBounds = readOp.hasOutOfBoundsDim();
+    // Handle the SLM case.
+    if (isSharedMemory) {
+      // If the memref is SLM only support 2D case for now.
+      if (loadedVecTy.getRank() != 2)
+        return rewriter.notifyMatchFailure(
+            readOp, "Only 2D vector loads are supported for SLM");
+      AffineMap readMap = readOp.getPermutationMap();
+      if (!readMap.isMinorIdentity())
+        return rewriter.notifyMatchFailure(
+            readOp, "Transpose not supported for SLM loads");
+      // Out of bounds case is not supported for SLM loads.
+      if (isOutOfBounds)
+        return rewriter.notifyMatchFailure(
+            readOp, "Out-of-bounds access is not supported for SLM loads");
+
+      // Create mem_desc for SLM
+      auto memDescType =
+          xegpu::MemDescType::get(rewriter.getContext(), readMemTy.getShape(),
+                                  readMemTy.getElementType(),
+                                  /*mem_layout=*/nullptr);
+      auto createMemDescOp = xegpu::CreateMemDescOp::create(
+          rewriter, loc, memDescType, readOp.getBase());
+      // Convert indices to OpFoldResult for LoadMatrixOp
+      SmallVector<OpFoldResult> indices =
+          getAsOpFoldResult(readOp.getIndices());
+      auto loadMatrixOp = xegpu::LoadMatrixOp::create(
+          rewriter, loc, loadedVecTy, createMemDescOp.getResult(), indices,
+          /*layout=*/nullptr);
+
+      rewriter.replaceOp(readOp, loadMatrixOp.getResult());
+      return success();
+    }
+
     if (isOutOfBounds && !isZeroConstant(readOp.getPadding()))
       return rewriter.notifyMatchFailure(
           readOp, "Unsupported non-zero padded out-of-bounds read");
@@ -631,21 +668,24 @@ struct TransferWriteLowering
 
     if (failed(transferPreconditions(rewriter, writeOp)))
       return failure();
+    // Perform common data transfer checks.
+    VectorType vecTy = writeOp.getVectorType();
+    auto writeMemTy = cast<MemRefType>(writeOp.getShapedType());
+    // Check if the memref has address space 3 (shared local memory)
+    bool isSharedMemory = xegpu::XeGPUDialect::isSharedMemory(writeMemTy);
 
     // TODO:This check needs to be replaced with proper uArch capability check
     auto chip = xegpu::getChipStr(writeOp);
-    if (chip != "pvc" && chip != "bmg") {
-      // lower to scattered store Op if the target HW doesn't have 2d block
-      // store support
+    // Lower to scattered store Op if the target HW doesn't have 2d block
+    // store support and the memref is not SLM.
+    if (chip != "pvc" && chip != "bmg" && !isSharedMemory) {
+
       // TODO: add support for OutOfBound access
       if (writeOp.hasOutOfBoundsDim())
         return failure();
       return lowerToScatteredStoreOp(writeOp, rewriter);
     }
 
-    // Perform common data transfer checks.
-    VectorType vecTy = writeOp.getVectorType();
-    auto writeMemTy = cast<MemRefType>(writeOp.getShapedType());
     if (failed(storeLoadPreconditions(rewriter, writeOp, vecTy, writeMemTy)))
       return failure();
 
@@ -653,6 +693,34 @@ struct TransferWriteLowering
     if (!map.isMinorIdentity())
       return rewriter.notifyMatchFailure(writeOp, "Expects identity map");
 
+    // For shared local memory (address space 3), use create_mem_desc +
+    // store_matrix
+    if (isSharedMemory) {
+      // Only support 2D case for now.
+      if (vecTy.getRank() != 2)
+        return rewriter.notifyMatchFailure(
+            writeOp, "Only 2D vector stores are supported for SLM");
+      // Create mem_desc for SLM
+      auto memDescType =
+          xegpu::MemDescType::get(rewriter.getContext(), writeMemTy.getShape(),
+                                  writeMemTy.getElementType(),
+                                  /*mem_layout=*/nullptr);
+
+      auto createMemDescOp = xegpu::CreateMemDescOp::create(
+          rewriter, loc, memDescType, writeOp.getBase());
+
+      // Convert indices to OpFoldResult for StoreMatrixOp
+      SmallVector<OpFoldResult> indices =
+          getAsOpFoldResult(writeOp.getIndices());
+
+      xegpu::StoreMatrixOp::create(rewriter, loc, writeOp.getVector(),
+                                   createMemDescOp.getResult(), indices,
+                                   /*layout=*/nullptr);
+
+      rewriter.eraseOp(writeOp);
+      return success();
+    }
+
     auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
         rewriter, loc, writeOp.getBase(),
         getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 950371e17255f..d1f4b712b9246 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,6 +8,8 @@
 
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
@@ -15,6 +17,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 
 using std::optional;
@@ -121,6 +124,20 @@ static SmallVector<SmallVector<int64_t>> genStaticCoordinates(
   return coordinates;
 }
 
+// Checks if the given memref type represents shared local memory (SLM).
+bool XeGPUDialect::isSharedMemory(const MemRefType &memrefTy) {
+  Attribute attr = memrefTy.getMemorySpace();
+  if (!attr)
+    return false; // Default memory space is not shared local memory
+  if (auto intAttr = llvm::dyn_cast_if_present<IntegerAttr>(attr))
+    return intAttr.getInt() == 3;
+  if (auto memrefSpace = llvm::dyn_cast_if_present<MemorySpaceAttr>(attr))
+    return memrefSpace.getValue() == MemorySpace::SLM;
+  if (auto xevmSpace = llvm::dyn_cast_if_present<xevm::AddrSpaceAttr>(attr))
+    return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
+  return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
+}
+
 // Checks if the given shape can be evenly distributed based on the layout
 // and data factors provided by the LayoutAttr.
 bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef<int64_t> shape,
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 5697097a4c999..6955cd1e26a53 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -8,7 +8,6 @@
 
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -23,17 +22,6 @@
 using namespace mlir;
 using namespace mlir::xegpu;
 
-static bool isSharedMemory(const MemRefType &memrefTy) {
-  Attribute attr = memrefTy.getMemorySpace();
-  if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr))
-    return intAttr.getInt() == 3;
-  if (auto memrefSpace = llvm::dyn_cast<MemorySpaceAttr>(attr))
-    return memrefSpace.getValue() == MemorySpace::SLM;
-  if (auto xevmSpace = llvm::dyn_cast<xevm::AddrSpaceAttr>(attr))
-    return xevmSpace.getValue() == xevm::AddrSpace::SHARED;
-  return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr);
-}
-
 template <typename T>
 static std::string makeString(T array, bool breakline = false) {
   std::string buf;
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index 1a19c8a13f120..7f9fe1be67438 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -116,8 +116,8 @@ gpu.func @load_transposed(%source: memref<32x64xf32>,
 
 // LOAD-ND-LABEL:  @load_transposed(
 // LOAD-ND-SAME:   %[[SRC:.+]]: memref<32x64xf32>,
-// LOAD-ND-SAME:   %[[OFFSET1:.+]]: index, 
-// LOAD-ND-SAME:   %[[OFFSET2:.+]]: index  
+// LOAD-ND-SAME:   %[[OFFSET1:.+]]: index,
+// LOAD-ND-SAME:   %[[OFFSET2:.+]]: index
 // LOAD-ND:        %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]]
 // LOAD-ND-SAME:     memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32
 // LOAD-ND:        %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]]
@@ -221,7 +221,7 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>,
 // LOAD-GATHER-DAG:    %[[OFFSETS:.+]] = arith.addi %[[BCASTIDX]], {{.*}} : vector<8x16xindex>
 // LOAD-GATHER-DAG:    %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %arg0 : memref<?x8x16xf32> -> index
 // LOAD-GATHER-DAG:    %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
-// LOAD-GATHER:        %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[OFFSETS]]{{\]}}, %[[CST_0]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32> 
+// LOAD-GATHER:        %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[OFFSETS]]{{\]}}, %[[CST_0]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
 
 }
 
@@ -439,7 +439,7 @@ gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %of
 // LOAD-ND-SAME:   %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-ND-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
 // LOAD-ND:        %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
-// LOAD-ND:        %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> 
+// LOAD-ND:        %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
 // LOAD-ND:        %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
 // LOAD-ND:        %[[STEP:.+]] = vector.step : vector<8xindex>
 // LOAD-ND:        arith.muli {{.*}} : index
@@ -455,7 +455,7 @@ gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %of
 // LOAD-GATHER-SAME:   %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-GATHER-SAME:   %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
 // LOAD-GATHER:        %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
-// LOAD-GATHER:        %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> 
+// LOAD-GATHER:        %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
 // LOAD-GATHER:        %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
 // LOAD-GATHER:        %[[STEP:.+]] = vector.step : vector<8xindex>
 // LOAD-GATHER:        arith.muli {{.*}} : index
@@ -510,3 +510,40 @@ gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %of
 // LOAD-GATHER:        %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
 // LOAD-GATHER:        %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16>
 }
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_2D_vector_addrspace3(%source: memref<16x32xf32, 3>,
+    %offset: index) -> vector<8x16xf32> {
+  %c0 = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %source[%offset, %offset], %c0
+    {in_bounds = [true, true]} : memref<16x32xf32, 3>, vector<8x16xf32>
+  gpu.return %0 : vector<8x16xf32>
+}
+
+// LOAD-ND-LABEL: @load_2D_vector_addrspace3
+// LOAD-ND-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// LOAD-ND-SAME: %[[OFFSET:.+]]: index
+// LOAD-ND: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// LOAD-ND: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
+// LOAD-ND: gpu.return %[[DATA]] : vector<8x16xf32>
+
+}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_1D_vector_addrspace3_unsupported(%source: memref<32xf32, 3>,
+    %offset: index) -> vector<8xf32> {
+  %c0 = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %source[%offset], %c0
+    {in_bounds = [true]} : memref<32xf32, 3>, vector<8xf32>
+  gpu.return %0 : vector<8xf32>
+}
+
+// LOAD-ND-LABEL: @load_1D_vector_addrspace3_unsupported
+// LOAD-ND: vector.transfer_read
+
+// LOAD-GATHER-LABEL: @load_1D_vector_addrspace3_unsupported
+// LOAD-GATHER: vector.transfer_read
+
+}
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
index 66da64225678e..abf4b031937f3 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
@@ -344,3 +344,23 @@ gpu.func @store_to_subview(%vec: vector<8xf16>,
 // STORE-SCATTER:        %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
 // STORE-SCATTER:        xegpu.store %[[VEC]], %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : vector<8xf16>, i64, vector<8xindex>, vector<8xi1>
 }
+
+// -----
+gpu.module @xevm_module {
+gpu.func @store_2D_vector_addrspace3(%vec: vector<8x16xf32>,
+    %source: memref<16x32xf32, 3>, %offset: index) {
+  vector.transfer_write %vec, %source[%offset, %offset]
+    {in_bounds = [true, true]}
+    : vector<8x16xf32>, memref<16x32xf32, 3>
+  gpu.return
+}
+
+// STORE-ND-LABEL: @store_2D_vector_addrspace3
+// STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>
+// STORE-ND-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// STORE-ND-SAME: %[[OFFSET:.+]]: index
+// STORE-ND: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// STORE-ND: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
+// STORE-ND: gpu.return
+
+}

>From 0b3b945961d0093615deab8695f1b4464a23342d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:08:47 +0000
Subject: [PATCH 2/4] save work

---
 .../Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir   | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index 7f9fe1be67438..f7bcbe17c542e 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -528,6 +528,13 @@ gpu.func @load_2D_vector_addrspace3(%source: memref<16x32xf32, 3>,
 // LOAD-ND: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
 // LOAD-ND: gpu.return %[[DATA]] : vector<8x16xf32>
 
+// LOAD-GATHER-LABEL: @load_2D_vector_addrspace3
+// LOAD-GATHER-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// LOAD-GATHER-SAME: %[[OFFSET:.+]]: index
+// LOAD-GATHER: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// LOAD-GATHER: %[[DATA:.+]] = xegpu.load_matrix %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : !xegpu.mem_desc<16x32xf32>, index, index -> vector<8x16xf32>
+// LOAD-GATHER: gpu.return %[[DATA]] : vector<8x16xf32>
+
 }
 
 // -----

>From 8b11effdd59b7947f17c0e32ddde4e944661d6ae Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:09:09 +0000
Subject: [PATCH 3/4] save work

---
 .../transfer-write-to-xegpu.mlir              | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
index abf4b031937f3..9dab33bc78e89 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir
@@ -363,4 +363,30 @@ gpu.func @store_2D_vector_addrspace3(%vec: vector<8x16xf32>,
 // STORE-ND: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
 // STORE-ND: gpu.return
 
+// STORE-SCATTER-LABEL: @store_2D_vector_addrspace3
+// STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>
+// STORE-SCATTER-SAME: %[[SOURCE:.+]]: memref<16x32xf32, 3>
+// STORE-SCATTER-SAME: %[[OFFSET:.+]]: index
+// STORE-SCATTER: %[[MEM_DESC:.+]] = xegpu.create_mem_desc %[[SOURCE]] : memref<16x32xf32, 3> -> !xegpu.mem_desc<16x32xf32>
+// STORE-SCATTER: xegpu.store_matrix %[[VEC]], %[[MEM_DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32>, !xegpu.mem_desc<16x32xf32>, index, index
+// STORE-SCATTER: gpu.return
+
+}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @store_1D_vector_addrspace3_unsupported(%vec: vector<8xf32>,
+    %source: memref<32xf32, 3>, %offset: index) {
+  vector.transfer_write %vec, %source[%offset]
+    {in_bounds = [true]}
+    : vector<8xf32>, memref<32xf32, 3>
+  gpu.return
+}
+
+// STORE-ND-LABEL: @store_1D_vector_addrspace3_unsupported
+// STORE-ND: vector.transfer_write
+
+// STORE-SCATTER-LABEL: @store_1D_vector_addrspace3_unsupported
+// STORE-SCATTER: vector.transfer_write
+
 }

>From 448b28ce0572e383268283454bde7f9de61e6988 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Sat, 18 Apr 2026 00:15:02 +0000
Subject: [PATCH 4/4] save work

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index dc6e972e3b5a6..b1490c7742a26 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -38,10 +38,6 @@ def XeGPU_Dialect : Dialect {
     let useDefaultAttributePrinterParser = true;
 
     let extraClassDeclaration = [{
-      /// Checks if the given shape can be evenly distributed based on the layout
-      /// and data factors provided by the LayoutAttr.
-      static bool isEvenlyDistributable(llvm::ArrayRef<int64_t> shape, xegpu::DistributeLayoutAttr attr);
-
       /// Checks if the given memref type represents shared local memory (SLM).
       /// Returns true if the memory space is address space 3, MemorySpace::SLM,
       /// xevm::AddrSpace::SHARED, or a GPU workgroup memory address space.