[Mlir-commits] [mlir] [mlir][XeGPU] Add MemoryEffectsOpInterface for XeGPU memory related ops. (PR #125314)
Charitha Saumya
llvmlistbot at llvm.org
Mon Feb 3 09:51:44 PST 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/125314
From ece99af0a9c42390ad77ffb59d6e4473f2fd3644 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 31 Jan 2025 22:55:50 +0000
Subject: [PATCH 1/3] add mem side effects interface
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 393 +++++++++---------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 26 +-
2 files changed, 228 insertions(+), 191 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index c2335eecc3781d..d98aa9ffb26f1a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -276,97 +276,103 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
}
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "loads an n-D block from memory (represented by TensorDesc) "
- "to registers (represented by vector)";
- let description = [{
- LoadNdOp essentially mimics the hardware block read instruction to read
- a block of data from memory to register. It takes a set of optional cache
- hints for each level of cache, L1, L2 and L3. If the hardware does not have
- a corresponding cache, the corresponding cache hint attribute will be masked.
- VNNI transformation is a hardware feature of Intel GPUs that packs data
- during the load for the B operand of a matrix operation when the bit width
- of the data type is less than 32 bits, e.g., fp16. Transpose is another
- Intel hardware feature that transposes the data while loading it when the
- data type is fp32 or fp64. This implies that vnni and transpose cannot
- exist at the same time.
-
- Example:
- ```mlir
- xegpu.load_nd %1 {transpose = [1, 0],
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<streaming>}
- : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<UnitAttr>: $packed,
- OptionalAttr<DenseI64ArrayAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let results = (outs XeGPU_ValueType: $value);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "loads an n-D block from memory (represented by TensorDesc) "
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If the hardware does not have
+ a corresponding cache, the corresponding cache hint attribute will be masked.
+ VNNI transformation is a hardware feature of Intel GPUs that packs data
+ during the load for the B operand of a matrix operation when the bit width
+ of the data type is less than 32 bits, e.g., fp16. Transpose is another
+ Intel hardware feature that transposes the data while loading it when the
+ data type is fp32 or fp64. This implies that vnni and transpose cannot
+ exist at the same time.
+
+ Example:
+ ```mlir
+ xegpu.load_nd %1 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<streaming>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<UnitAttr>: $packed,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
- let hasVerifier = 1;
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+ let hasVerifier = 1;
}
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "stores an n-D block register region back to memory, currently only supports 2D";
-
- let description = [{
- StoreNdOp essentially mimics the hardware block write instruction to
- write a block of data from register into the memory region as described
- by the TensorDesc. It takes a set of optional cache hints for each level
- of cache, L1, L2 and L3. If the hardware does not have a corresponding
- cache, the corresponding cache hint attribute will be masked.
-
- Example:
- ```mlir
- xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getValueType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "stores an n-D block register region back to memory, currently only supports 2D";
+
+ let description = [{
+ StoreNdOp essentially mimics the hardware block write instruction to
+ write a block of data from register into the memory region as described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+ of cache, L1, L2 and L3. If the hardware does not have a corresponding
+ cache, the corresponding cache hint attribute will be masked.
+
+ Example:
+ ```mlir
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc))}];
- let hasVerifier = 1;
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc))}];
+ let hasVerifier = 1;
}
def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
@@ -548,131 +554,138 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let hasVerifier = 1;
}
-def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "load a set of scattered data points from memory.";
-
- let description = [{ It (aka. load) loads data for each work-item. The output
- describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When the chunk size
- is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
- to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
- Notably, there is a transpose effect on the result (as compared to the TensorDesc)
- due to the hardware implementation. Therefore, a transpose attribute is introduced
- on purpose, making sure users are aware of this implicit transformation.
-
- The mask operand masks out memory access so that it is safe to pass out-of-boundary
- addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-
- Example 1:
- ```mlir
- %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<16xi1> -> vector<16xf32>
- ```
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "load a set of scattered data points from memory.";
+
+ let description = [{ It (aka. load) loads data for each work-item. The output
+ describes the data being loaded at the subgroup level, so its size is
+ consistent with the number of work-items in a subgroup. When the chunk size
+ is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
+ to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
+ Notably, there is a transpose effect on the result (as compared to the TensorDesc)
+ due to the hardware implementation. Therefore, a transpose attribute is introduced
+ on purpose, making sure users are aware of this implicit transformation.
+
+ The mask operand masks out memory access so that it is safe to pass out-of-boundary
+ addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+ Example 1:
+ ```mlir
+ %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+ vector<16xi1> -> vector<16xf32>
+ ```
- Example 2:
- ```mlir
- %2 = xegpu.load %1, %0 {transpose,
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
- vector<16xi1> -> vector<8x16xf32>
- ```
+ Example 2:
+ ```mlir
+ %2 = xegpu.load %1, %0 {transpose,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+ vector<16xi1> -> vector<8x16xf32>
+ ```
- }];
+ }];
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let results = (outs XeGPU_ValueType: $value);
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let results = (outs XeGPU_ValueType: $value);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- mlir::Type getElementType() {
- auto type = getValue().getType();
- return getElementTypeOrSelf(type);
- }
+ mlir::Type getElementType() {
+ auto type = getValue().getType();
+ return getElementTypeOrSelf(type);
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
+ Type getMaskType() {
+ return getMask().getType();
+ }
- }];
+ }];
- let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
- `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+ let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+ `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations. The value is
- typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
- a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
- and dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
- has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
- introduced on purpose, making sure users are aware of this implicit transformation.
-
- Example 1:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
- ```
+def XeGPU_StoreScatterOp : XeGPU_Op<"store",
+ [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "store data to scattered memory locations.";
+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+ a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+ and dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
+ has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+ introduced on purpose, making sure users are aware of this implicit transformation.
+
+ Example 1:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+ ```
- Example 2:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {transpose,
- l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
- ```
+ Example 2:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {transpose,
+ l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+ ```
- }];
+ }];
- let arguments = (ins
- XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
- }];
+ Type getMaskType() {
+ return getMask().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+ let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
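The `DeclareOpInterfaceMethods<MemoryEffectsOpInterface>` entries above only make ODS declare the `getEffects` hook on each op; the definitions are supplied by hand in XeGPUOps.cpp below. Once defined, generic passes can discover the effects through the interface. A minimal sketch of the consumer side (the helper name `reportsRead` is illustrative, not part of this patch):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

// Returns true if `op` reports a memory read through the interface added by
// this patch (ODS name MemoryEffectsOpInterface, C++ class
// MemoryEffectOpInterface).
static bool reportsRead(mlir::Operation *op) {
  auto iface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op);
  if (!iface)
    return false; // Op exposes no effect information.
  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> effects;
  iface.getEffects(effects);
  return llvm::any_of(effects, [](const auto &effect) {
    return llvm::isa<mlir::MemoryEffects::Read>(effect.getEffect());
  });
}
```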
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cd883baa986b85..d015e5772a94f2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -324,6 +324,12 @@ LogicalResult LoadNdOp::verify() {
return success();
}
+void LoadNdOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_StoreNdOp
//===----------------------------------------------------------------------===//
@@ -361,6 +367,12 @@ LogicalResult StoreNdOp::verify() {
return success();
}
+void StoreNdOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Write::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_UpdateNDOffsetOp
//===----------------------------------------------------------------------===//
@@ -494,7 +506,7 @@ LogicalResult PrefetchOp::verify() {
}
//===----------------------------------------------------------------------===//
-// XeGPU_LoadGatherOp
+// XeGPU_jrOp
//===----------------------------------------------------------------------===//
LogicalResult LoadGatherOp::verify() {
auto tdescTy = getTensorDescType();
@@ -553,6 +565,12 @@ LogicalResult LoadGatherOp::verify() {
return success();
}
+void LoadGatherOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_StoreScatterOp
//===----------------------------------------------------------------------===//
@@ -605,6 +623,12 @@ LogicalResult StoreScatterOp::verify() {
return success();
}
+void StoreScatterOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Write::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
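With these definitions in place, the generic side-effect utilities start working for the XeGPU memory ops. For example (a sketch over a hypothetical worklist; not part of this patch), an `xegpu.load_nd` whose results have no users is now trivially dead, while an `xegpu.store_nd` never is, because it reports a write:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

// Erase trivially dead ops. After this patch, an xegpu.load_nd with unused
// results qualifies (it only reads), while xegpu.store_nd is always kept
// (it reports MemoryEffects::Write).
static void pruneDeadOps(llvm::ArrayRef<mlir::Operation *> worklist) {
  for (mlir::Operation *op : worklist)
    if (mlir::isOpTriviallyDead(op))
      op->erase();
}
```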
From 1be0ae30b85e6270756e41fbc939edc3f42ba7e9 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 31 Jan 2025 23:18:30 +0000
Subject: [PATCH 2/3] add mem side effects interface
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 390 +++++++++---------
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +-
2 files changed, 194 insertions(+), 198 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d98aa9ffb26f1a..0ff723005d4359 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -277,102 +277,101 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
]> {
- let summary = "loads an n-D block from memory (represented by TensorDesc) "
- "to registers (represented by vector)";
- let description = [{
- LoadNdOp essentially mimics the hardware block read instruction to read
- a block of data from memory to register. It takes a set of optional cache
- hints for each level of cache, L1, L2 and L3. If the hardware does not have
- a corresponding cache, the corresponding cache hint attribute will be masked.
- VNNI transformation is a hardware feature of Intel GPUs that packs data
- during the load for the B operand of a matrix operation when the bit width
- of the data type is less than 32 bits, e.g., fp16. Transpose is another
- Intel hardware feature that transposes the data while loading it when the
- data type is fp32 or fp64. This implies that vnni and transpose cannot
- exist at the same time.
-
- Example:
- ```mlir
- xegpu.load_nd %1 {transpose = [1, 0],
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<streaming>}
- : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<UnitAttr>: $packed,
- OptionalAttr<DenseI64ArrayAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let results = (outs XeGPU_ValueType: $value);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+ let summary = "loads an n-D block from memory (represented by TensorDesc) "
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If the hardware does not have
+ a corresponding cache, the corresponding cache hint attribute will be masked.
+ VNNI transformation is a hardware feature of Intel GPUs that packs data
+ during the load for the B operand of a matrix operation when the bit width
+ of the data type is less than 32 bits, e.g., fp16. Transpose is another
+ Intel hardware feature that transposes the data while loading it when the
+ data type is fp32 or fp64. This implies that vnni and transpose cannot
+ exist at the same time.
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ Example:
+ ```mlir
+ xegpu.load_nd %1 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<streaming>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<UnitAttr>: $packed,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
- let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
- let hasVerifier = 1;
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+ let hasVerifier = 1;
}
def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
]> {
- let summary = "stores an n-D block register region back to memory, currently only supports 2D";
-
- let description = [{
- StoreNdOp essentially mimics the hardware block write instruction to
- write a block of data from register into the memory region as described
- by the TensorDesc. It takes a set of optional cache hints for each level
- of cache, L1, L2 and L3. If the hardware does not have a corresponding
- cache, the corresponding cache hint attribute will be masked.
-
- Example:
- ```mlir
- xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getValueType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+ let summary = "stores an n-D block register region back to memory, currently only supports 2D";
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ let description = [{
+ StoreNdOp essentially mimics the hardware block write instruction to
+ write a block of data from register into the memory region as described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+ of cache, L1, L2 and L3. If the hardware does not have a corresponding
+ cache, the corresponding cache hint attribute will be masked.
+
+ Example:
+ ```mlir
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc))}];
- let hasVerifier = 1;
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc))}];
+ let hasVerifier = 1;
}
def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
@@ -555,137 +554,134 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
}
def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
]> {
- let summary = "load a set of scattered data points from memory.";
-
- let description = [{ It (aka. load) loads data for each work-item. The output
- describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When the chunk size
- is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
- to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
- Notably, there is a transpose effect on the result (as compared to the TensorDesc)
- due to the hardware implementation. Therefore, a transpose attribute is introduced
- on purpose, making sure users are aware of this implicit transformation.
-
- The mask operand masks out memory access so that it is safe to pass out-of-boundary
- addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-
- Example 1:
- ```mlir
- %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<16xi1> -> vector<16xf32>
- ```
+ let summary = "load a set of scattered data points from memory.";
+
+ let description = [{ It (aka. load) loads data for each work-item. The output
+ describes the data being loaded at the subgroup level, so its size is
+ consistent with the number of work-items in a subgroup. When the chunk size
+ is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
+ to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
+ Notably, there is a transpose effect on the result (as compared to the TensorDesc)
+ due to the hardware implementation. Therefore, a transpose attribute is introduced
+ on purpose, making sure users are aware of this implicit transformation.
+
+ The mask operand masks out memory access so that it is safe to pass out-of-boundary
+ addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+ Example 1:
+ ```mlir
+ %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+ vector<16xi1> -> vector<16xf32>
+ ```
- Example 2:
- ```mlir
- %2 = xegpu.load %1, %0 {transpose,
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
- vector<16xi1> -> vector<8x16xf32>
- ```
+ Example 2:
+ ```mlir
+ %2 = xegpu.load %1, %0 {transpose,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+ vector<16xi1> -> vector<8x16xf32>
+ ```
- }];
+ }];
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let results = (outs XeGPU_ValueType: $value);
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let results = (outs XeGPU_ValueType: $value);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- mlir::Type getElementType() {
- auto type = getValue().getType();
- return getElementTypeOrSelf(type);
- }
+ mlir::Type getElementType() {
+ auto type = getValue().getType();
+ return getElementTypeOrSelf(type);
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
+ Type getMaskType() {
+ return getMask().getType();
+ }
- }];
+ }];
- let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
- `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+ let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+ `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
-def XeGPU_StoreScatterOp : XeGPU_Op<"store",
- [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
- ]> {
- let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations. The value is
- typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
- a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
- and dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
- has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
- introduced on purpose, making sure users are aware of this implicit transformation.
-
- Example 1:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
- ```
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
+ AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+ let summary = "store data to scattered memory locations.";
+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+ a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+ and dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
+ has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+ introduced on purpose, making sure users are aware of this implicit transformation.
+
+ Example 1:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+ ```
- Example 2:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {transpose,
- l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
- ```
+ Example 2:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {transpose,
+ l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+ ```
- }];
+ }];
- let arguments = (ins
- XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
- }];
+ Type getMaskType() {
+ return getMask().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+ let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index d015e5772a94f2..443a1347334e23 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -506,7 +506,7 @@ LogicalResult PrefetchOp::verify() {
}
//===----------------------------------------------------------------------===//
-// XeGPU_jrOp
+// XeGPU_LoadGatherOp
//===----------------------------------------------------------------------===//
LogicalResult LoadGatherOp::verify() {
auto tdescTy = getTensorDescType();
From ffae0295ea78ce2ae00af83227063edeb04e4f20 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 3 Feb 2025 16:48:54 +0000
Subject: [PATCH 3/3] add mem side effects interface
---
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 11 ++++-----
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 24 -------------------
2 files changed, 5 insertions(+), 30 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0ff723005d4359..7560ede058faa3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -277,8 +277,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
]> {
let summary = "loads an n-D block from memory (represented by TensorDesc) "
"to registers (represented by vector)";
@@ -331,7 +330,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
}
def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
- AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
]> {
let summary = "stores an n-D block register region back to memory, currently only supports 2D";
@@ -554,8 +553,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
}
def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
- AllElementTypesMatch<["value", "TensorDesc"]>,
- DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
]> {
let summary = "load a set of scattered data points from memory.";
@@ -627,7 +625,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
}
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
- AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+ AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
+ ]> {
let summary = "store data to scattered memory locations.";
let description = [{ It (aka. store) stores data to scattered memory locations. The value is
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 443a1347334e23..cd883baa986b85 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -324,12 +324,6 @@ LogicalResult LoadNdOp::verify() {
return success();
}
-void LoadNdOp::getEffects(
- SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(MemoryEffects::Read::get());
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_StoreNdOp
//===----------------------------------------------------------------------===//
@@ -367,12 +361,6 @@ LogicalResult StoreNdOp::verify() {
return success();
}
-void StoreNdOp::getEffects(
- SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(MemoryEffects::Write::get());
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_UpdateNDOffsetOp
//===----------------------------------------------------------------------===//
@@ -565,12 +553,6 @@ LogicalResult LoadGatherOp::verify() {
return success();
}
-void LoadGatherOp::getEffects(
- SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(MemoryEffects::Read::get());
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_StoreScatterOp
//===----------------------------------------------------------------------===//
@@ -623,12 +605,6 @@ LogicalResult StoreScatterOp::verify() {
return success();
}
-void StoreScatterOp::getEffects(
- SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
- &effects) {
- effects.emplace_back(MemoryEffects::Write::get());
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_UpdateOffsetOp
//===----------------------------------------------------------------------===//
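With the trait-based form in place for all four ops, passes can make conservative reordering and elimination decisions without special-casing XeGPU. One possible check (a sketch; the helper name `mayReorder` is illustrative):

```cpp
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

// Two ops may be swapped if at least one of them is free of memory effects;
// a pair such as xegpu.load_nd / xegpu.store_nd is conservatively kept in
// program order.
static bool mayReorder(mlir::Operation *a, mlir::Operation *b) {
  return mlir::isMemoryEffectFree(a) || mlir::isMemoryEffectFree(b);
}
```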