[Mlir-commits] [mlir] [MLIR][XeGPU][Draft] XeGPU distribution patterns for load_nd, store_nd, and create_nd_tdesc. (PR #119783)

Charitha Saumya llvmlistbot at llvm.org
Tue Feb 4 12:36:15 PST 2025


https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/119783

From dab68412dc7b133baec0b2ec17428f3378c65cc8 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov at intel.com>
Date: Mon, 9 Dec 2024 20:25:50 +0000
Subject: [PATCH 01/15] [MLIR] Create GPU utils library & move distribution
 utils

---
 .../mlir/Conversion/GPUCommon/GPUCommonPass.h |   2 +-
 .../mlir/Dialect/GPU/Transforms/Passes.h      |   2 +-
 .../Dialect/GPU/Utils/DistributionUtils.h     |  57 +++++++
 .../{Transforms/Utils.h => Utils/GPUUtils.h}  |   0
 mlir/lib/Dialect/GPU/CMakeLists.txt           |   3 +-
 .../GPU/Transforms/AsyncRegionRewriter.cpp    |   2 +-
 .../GPU/Transforms/KernelOutlining.cpp        |   2 +-
 .../GPU/Transforms/SubgroupReduceLowering.cpp |   2 +-
 mlir/lib/Dialect/GPU/Utils/CMakeLists.txt     |  14 ++
 .../Dialect/GPU/Utils/DistributionUtils.cpp   | 149 ++++++++++++++++++
 .../GPU/{Transforms => Utils}/Utils.cpp       |   2 +-
 .../Dialect/Vector/Transforms/CMakeLists.txt  |   1 +
 .../Vector/Transforms/VectorDistribute.cpp    | 139 +---------------
 13 files changed, 230 insertions(+), 145 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
 rename mlir/include/mlir/Dialect/GPU/{Transforms/Utils.h => Utils/GPUUtils.h} (100%)
 create mode 100644 mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
 create mode 100644 mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
 rename mlir/lib/Dialect/GPU/{Transforms => Utils}/Utils.cpp (96%)

diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 5f40315a849094d..094360e75ab617a 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -8,7 +8,7 @@
 #ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
 #define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
 
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Types.h"
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 8eb711962583dab..eb51d477e23f86f 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -13,7 +13,7 @@
 #ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
 #define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
 
-#include "Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
new file mode 100644
index 000000000000000..6efd23269719823
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
@@ -0,0 +1,57 @@
+//===- DistributionUtils.h - Distribution Utilities -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
+#define MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
+
+#include <utility>
+
+namespace mlir {
+namespace gpu {
+/// Return a value yielded by `warpOp` which satisfies the filter lambda
+/// condition and is not dead.
+OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
+                         const std::function<bool(Operation *)> &fn);
+
+/// Helper to create a new WarpExecuteOnLane0Op with a different signature.
+WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes);
+
+/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
+/// `indices` returns the index of each new output.
+WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes,
+    llvm::SmallVector<size_t> &indices);
+
+/// Helper to know if an op can be hoisted out of the region.
+bool canBeHoisted(Operation *op, function_ref<bool(Value)> definedOutside);
+
+/// Delinearize the given `laneId` into multiple dimensions, where each
+/// dimension's size is determined by `originalShape` and `distributedShape`
+/// together. This function expects the total number of threads needed for
+/// distribution to equal `warpSize`; if so, it returns true and updates
+/// `delinearizedIds`.
+bool delinearizeLaneId(OpBuilder &builder, Location loc,
+                       ArrayRef<int64_t> originalShape,
+                       ArrayRef<int64_t> distributedShape, int64_t warpSize,
+                       Value laneId, SmallVectorImpl<Value> &delinearizedIds);
+
+} // namespace gpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
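
As a quick illustration of the delinearizeLaneId contract above, here is a
minimal sketch (not part of the patch; the 16x64 original shape, the 16x2
per-lane shape, the warp size of 32, and the helper name are made up, and the
builder, location, and lane id are assumed to come from the caller):

  #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"

  using namespace mlir;

  // Per-dimension lane counts are {16/16, 64/2} = {1, 32}; their product
  // equals the warp size of 32, so the call returns true. ids[1] ends up
  // holding the lane id itself and ids[0] stays at the constant 0.
  static bool exampleDelinearize(OpBuilder &builder, Location loc, Value laneId,
                                 SmallVectorImpl<Value> &ids) {
    return gpu::delinearizeLaneId(builder, loc, /*originalShape=*/{16, 64},
                                  /*distributedShape=*/{16, 2},
                                  /*warpSize=*/32, laneId, ids);
  }
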
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
similarity index 100%
rename from mlir/include/mlir/Dialect/GPU/Transforms/Utils.h
rename to mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index a59645480aba219..1026e9b509332af 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -40,7 +40,6 @@ add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/ShuffleRewriter.cpp
   Transforms/SPIRVAttachTarget.cpp
   Transforms/SubgroupReduceLowering.cpp
-  Transforms/Utils.cpp
   
   OBJECT
 
@@ -59,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   MLIRDataLayoutInterfaces
   MLIRExecutionEngineUtils
   MLIRGPUDialect
+  MLIRGPUUtils
   MLIRIR
   MLIRIndexDialect
   MLIRLLVMDialect
@@ -76,3 +76,4 @@ add_mlir_dialect_library(MLIRGPUTransforms
 
 add_subdirectory(TransformOps)
 add_subdirectory(Pipelines)
+add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index b2fa3a99c53fc37..41a5e39e55064e8 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -16,7 +16,7 @@
 #include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/PatternMatch.h"
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index ba0c80c50211e3a..a6a36848b5635d1 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -18,7 +18,7 @@
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 185f824351a2309..43eff3eddcc4915 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -13,7 +13,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Location.h"
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
new file mode 100644
index 000000000000000..69094c518a159e5
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_mlir_dialect_library(MLIRGPUUtils
+  Utils.cpp
+  DistributionUtils.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRAffineDialect
+  MLIRGPUDialect
+  MLIRSupport
+  MLIRIR
+  )
diff --git a/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
new file mode 100644
index 000000000000000..c6e8e03350bbce3
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
@@ -0,0 +1,149 @@
+//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements distribution utility methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/IR/Value.h"
+
+#include <numeric>
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndReplaceReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes) {
+  // Create a new op before the existing one, with the extra operands.
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(warpOp);
+  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
+      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
+      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
+
+  Region &opBody = warpOp.getBodyRegion();
+  Region &newOpBody = newWarpOp.getBodyRegion();
+  Block &newOpFirstBlock = newOpBody.front();
+  rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
+  rewriter.eraseBlock(&newOpFirstBlock);
+  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
+         "expected WarpOp with single block");
+
+  auto yield =
+      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
+
+  rewriter.modifyOpInPlace(
+      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
+  return newWarpOp;
+}
+
+WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndAppendReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes,
+    llvm::SmallVector<size_t> &indices) {
+  SmallVector<Type> types(warpOp.getResultTypes().begin(),
+                          warpOp.getResultTypes().end());
+  auto yield = cast<gpu::YieldOp>(
+      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
+                                              yield.getOperands().end());
+  for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
+    if (yieldValues.insert(std::get<0>(newRet))) {
+      types.push_back(std::get<1>(newRet));
+      indices.push_back(yieldValues.size() - 1);
+    } else {
+      // If the value already exits the region, don't create a new output.
+      for (auto [idx, yieldOperand] :
+           llvm::enumerate(yieldValues.getArrayRef())) {
+        if (yieldOperand == std::get<0>(newRet)) {
+          indices.push_back(idx);
+          break;
+        }
+      }
+    }
+  }
+  yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
+  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
+      rewriter, warpOp, yieldValues.getArrayRef(), types);
+  rewriter.replaceOp(warpOp,
+                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
+  return newWarpOp;
+}
+
+bool mlir::gpu::canBeHoisted(Operation *op,
+                             function_ref<bool(Value)> definedOutside) {
+  return llvm::all_of(op->getOperands(), definedOutside) &&
+         isMemoryEffectFree(op) && op->getNumRegions() == 0;
+}
+
+OpOperand *
+mlir::gpu::getWarpResult(WarpExecuteOnLane0Op warpOp,
+                         const std::function<bool(Operation *)> &fn) {
+  auto yield = cast<gpu::YieldOp>(
+      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  for (OpOperand &yieldOperand : yield->getOpOperands()) {
+    Value yieldValues = yieldOperand.get();
+    Operation *definedOp = yieldValues.getDefiningOp();
+    if (definedOp && fn(definedOp)) {
+      if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
+        return &yieldOperand;
+    }
+  }
+  return {};
+}
+
+bool mlir::gpu::delinearizeLaneId(OpBuilder &builder, Location loc,
+                                  ArrayRef<int64_t> originalShape,
+                                  ArrayRef<int64_t> distributedShape,
+                                  int64_t warpSize, Value laneId,
+                                  SmallVectorImpl<Value> &delinearizedIds) {
+  // If the original shape and the distributed shape are the same, we don't
+  // distribute at all; every thread handles the whole shape. In that case, we
+  // should not rely on lane IDs later, so just return an empty lane ID vector.
+  if (originalShape == distributedShape) {
+    delinearizedIds.clear();
+    return true;
+  }
+
+  SmallVector<int64_t> sizes;
+  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
+    if (large % small != 0)
+      return false;
+    sizes.push_back(large / small);
+  }
+  if (std::accumulate(sizes.begin(), sizes.end(), 1,
+                      std::multiplies<int64_t>()) != warpSize)
+    return false;
+
+  AffineExpr s0, s1;
+  bindSymbols(builder.getContext(), s0, s1);
+
+  int64_t usedThreads = 1;
+
+  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+  delinearizedIds.assign(sizes.size(), zero);
+
+  for (int i = sizes.size() - 1; i >= 0; --i) {
+    usedThreads *= sizes[i];
+    if (usedThreads == warpSize) {
+      // We've used up all available threads, so there is no need to perform
+      // the modulo anymore, and we can stop the calculation for the remaining
+      // dimensions.
+      delinearizedIds[i] = laneId;
+      break;
+    }
+    delinearizedIds[i] =
+        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
+    laneId = affine::makeComposedAffineApply(
+        builder, loc, s0.floorDiv(usedThreads), {laneId});
+  }
+  return true;
+}
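
Since the point of moving these helpers is to let other dialects reuse them,
here is a rough sketch of how a distribution pattern typically chains
getWarpResult and moveRegionToNewWarpOpAndAppendReturns (a hand-written
illustration, not part of the patch; the function name and the filter
predicate are hypothetical, and the op-specific rebuilding step is omitted):

  #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
  #include "mlir/Interfaces/SideEffectInterfaces.h"
  #include "llvm/ADT/STLExtras.h"

  using namespace mlir;

  static LogicalResult sinkOneResult(RewriterBase &rewriter,
                                     gpu::WarpExecuteOnLane0Op warpOp) {
    // 1. Find a live warp-op result whose yielded value is produced by an op
    //    we know how to distribute (here: any side-effect-free, single-use op).
    OpOperand *yieldOperand = gpu::getWarpResult(warpOp, [](Operation *op) {
      return isMemoryEffectFree(op) && op->hasOneUse();
    });
    if (!yieldOperand)
      return failure();
    Operation *producer = yieldOperand->get().getDefiningOp();

    // 2. Rebuild the warp op so that the producer's operands are also yielded;
    //    `newRetIndices` records where each appended value landed in the new
    //    result list.
    SmallVector<Type> operandTypes =
        llvm::to_vector(producer->getOperandTypes());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp =
        gpu::moveRegionToNewWarpOpAndAppendReturns(
            rewriter, warpOp, producer->getOperands(), operandTypes,
            newRetIndices);

    // 3. Recreate the producer after `newWarpOp`, reading its operands from
    //    newWarpOp.getResult(newRetIndices[i]) and replacing the original
    //    warp-op result (op-specific; omitted here).
    rewriter.setInsertionPointAfter(newWarpOp);
    return success();
  }
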
diff --git a/mlir/lib/Dialect/GPU/Transforms/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
similarity index 96%
rename from mlir/lib/Dialect/GPU/Transforms/Utils.cpp
rename to mlir/lib/Dialect/GPU/Utils/Utils.cpp
index e91aa18128c7b93..1f09875b3e2732b 100644
--- a/mlir/lib/Dialect/GPU/Transforms/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace mlir::gpu {
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index 9a3bd5d4593d637..8ca5cb6c6dfabcf 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -38,6 +38,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
   MLIRArithDialect
   MLIRDialectUtils
   MLIRGPUDialect
+  MLIRGPUUtils
   MLIRIR
   MLIRLinalgDialect
   MLIRMemRefDialect
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 3e1425983699511..d080b0b0bd44bd2 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -18,7 +19,6 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/FormatVariadic.h"
-#include <numeric>
 #include <utility>
 
 using namespace mlir;
@@ -162,92 +162,6 @@ struct DistributedLoadStoreHelper {
 
 } // namespace
 
-/// Helper to create a new WarpExecuteOnLane0Op with different signature.
-static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes) {
-  // Create a new op before the existing one, with the extra operands.
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(warpOp);
-  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
-      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
-      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
-
-  Region &opBody = warpOp.getBodyRegion();
-  Region &newOpBody = newWarpOp.getBodyRegion();
-  Block &newOpFirstBlock = newOpBody.front();
-  rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
-  rewriter.eraseBlock(&newOpFirstBlock);
-  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
-         "expected WarpOp with single block");
-
-  auto yield =
-      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
-
-  rewriter.modifyOpInPlace(
-      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
-  return newWarpOp;
-}
-
-/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
-/// `indices` return the index of each new output.
-static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes,
-    llvm::SmallVector<size_t> &indices) {
-  SmallVector<Type> types(warpOp.getResultTypes().begin(),
-                          warpOp.getResultTypes().end());
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
-                                              yield.getOperands().end());
-  for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
-    if (yieldValues.insert(std::get<0>(newRet))) {
-      types.push_back(std::get<1>(newRet));
-      indices.push_back(yieldValues.size() - 1);
-    } else {
-      // If the value already exit the region don't create a new output.
-      for (auto [idx, yieldOperand] :
-           llvm::enumerate(yieldValues.getArrayRef())) {
-        if (yieldOperand == std::get<0>(newRet)) {
-          indices.push_back(idx);
-          break;
-        }
-      }
-    }
-  }
-  yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
-  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
-      rewriter, warpOp, yieldValues.getArrayRef(), types);
-  rewriter.replaceOp(warpOp,
-                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
-  return newWarpOp;
-}
-
-/// Helper to know if an op can be hoisted out of the region.
-static bool canBeHoisted(Operation *op,
-                         function_ref<bool(Value)> definedOutside) {
-  return llvm::all_of(op->getOperands(), definedOutside) &&
-         isMemoryEffectFree(op) && op->getNumRegions() == 0;
-}
-
-/// Return a value yielded by `warpOp` which statifies the filter lamdba
-/// condition and is not dead.
-static OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
-                                const std::function<bool(Operation *)> &fn) {
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  for (OpOperand &yieldOperand : yield->getOpOperands()) {
-    Value yieldValues = yieldOperand.get();
-    Operation *definedOp = yieldValues.getDefiningOp();
-    if (definedOp && fn(definedOp)) {
-      if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
-        return &yieldOperand;
-    }
-  }
-  return {};
-}
-
 // Clones `op` into a new operation that takes `operands` and returns
 // `resultTypes`.
 static Operation *cloneOpWithOperandsAndTypes(RewriterBase &rewriter,
@@ -770,57 +684,6 @@ struct WarpOpConstant : public OpRewritePattern<WarpExecuteOnLane0Op> {
   }
 };
 
-/// Delinearize the given `laneId` into multiple dimensions, where each
-/// dimension's size is determined by `originalShape` and `distributedShape`
-/// together. This function expects the total numbers of threads needed for
-/// distribution is equal to `warpSize`. Returns true and updates
-/// `delinearizedIds` if so.
-bool delinearizeLaneId(OpBuilder &builder, Location loc,
-                       ArrayRef<int64_t> originalShape,
-                       ArrayRef<int64_t> distributedShape, int64_t warpSize,
-                       Value laneId, SmallVectorImpl<Value> &delinearizedIds) {
-  // If the original shape and the distributed shape is the same, we don't
-  // distribute at all--every thread is handling the whole. For such case, we
-  // should not rely on lane IDs later. So just return an empty lane ID vector.
-  if (originalShape == distributedShape) {
-    delinearizedIds.clear();
-    return true;
-  }
-
-  SmallVector<int64_t> sizes;
-  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
-    if (large % small != 0)
-      return false;
-    sizes.push_back(large / small);
-  }
-  if (std::accumulate(sizes.begin(), sizes.end(), 1,
-                      std::multiplies<int64_t>()) != warpSize)
-    return false;
-
-  AffineExpr s0, s1;
-  bindSymbols(builder.getContext(), s0, s1);
-
-  int64_t usedThreads = 1;
-
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  delinearizedIds.assign(sizes.size(), zero);
-
-  for (int i = sizes.size() - 1; i >= 0; --i) {
-    usedThreads *= sizes[i];
-    if (usedThreads == warpSize) {
-      // We've used up all available threads. Don't need to perform modulo
-      // anymore. And we can stop the calculation for further dimensions.
-      delinearizedIds[i] = laneId;
-      break;
-    }
-    delinearizedIds[i] =
-        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
-    laneId = affine::makeComposedAffineApply(
-        builder, loc, s0.floorDiv(usedThreads), {laneId});
-  }
-  return true;
-}
-
 /// Sink out transfer_read op feeding into a warp op yield.
 /// ```
 /// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {

From f6cd50a7028b806824194baf3cd231e5342f12dc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 19:39:28 +0000
Subject: [PATCH 02/15] pass added

---
 .../Dialect/XeGPU/Transforms/Transforms.h     |   1 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  10 +-
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |   5 +
 .../XeGPU/Transforms/XeGPUDistribute.cpp      | 393 ++++++++++++++++++
 mlir/test/Dialect/XeGPU/xegpu-distribute.mlir |  81 ++++
 mlir/test/lib/Dialect/CMakeLists.txt          |   1 +
 mlir/test/lib/Dialect/XeGPU/CMakeLists.txt    |  16 +
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  58 +++
 mlir/tools/mlir-opt/CMakeLists.txt            |   1 +
 mlir/tools/mlir-opt/mlir-opt.cpp              |   2 +
 10 files changed, 563 insertions(+), 5 deletions(-)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
 create mode 100644 mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
 create mode 100644 mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df0693722..fe5198d1ac6dba3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,7 @@ namespace xegpu {
 
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+void populateXeGPUDistributePatterns(RewritePatternSet &patterns);
 
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 9d3c4366a7bd507..f0e67df3ee60693 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -281,11 +281,11 @@ LogicalResult LoadNdOp::verify() {
     tdescShape.insert(it, array_len);
   }
 
-  if (tdescShape != valueShape)
-    return emitOpError() << "Result shape doesn't match TensorDesc shape."
-                         << "The expected shape is " << makeString(tdescShape)
-                         << ". But the given shape is "
-                         << makeString(valueShape) << ".\n";
+  // if (tdescShape != valueShape)
+  //   return emitOpError() << "Result shape doesn't match TensorDesc shape."
+  //                        << "The expected shape is " << makeString(tdescShape)
+  //                        << ". But the given shape is "
+  //                        << makeString(valueShape) << ".\n";
   return success();
 }
 
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 7fb64d3b97b87d5..148ff46ba41b724 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUFoldAliasOps.cpp
+  XeGPUDistribute.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
@@ -12,6 +13,10 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   MLIRIR
   MLIRMemRefDialect
   MLIRXeGPUDialect
+  MLIRVectorDialect
+  MLIRVectorUtils
+  MLIRArithDialect
+  MLIRFuncDialect
   MLIRPass
   MLIRTransforms
 )
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
new file mode 100644
index 000000000000000..52c953697d0f346
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -0,0 +1,393 @@
+//===- XeGPUDistribute.cpp - XeGPU distribute ops to work items -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "xegpu-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
+
+// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
+// /// The warp op will still contain the original op that will not be used by the
+// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// the create_nd_tdesc's arguments.
+// /// The rewrite will create a subview of the size used by a single work item
+// /// with an appropriate offset. The distributed create_nd_tdesc points into
+// /// the subview without an offset. The tensor descriptor type is distributed
+// /// according to the sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// ///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+// ///                   (!xegpu.tensor_desc<4x8xf32>) {
+// ///     ...
+// ///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+// ///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+// ///     vector.yield %td
+// ///   }
+// /// ```
+// /// To
+// /// ```
+// ///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+// ///     ...
+// ///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+// ///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+// ///     vector.yield %arg0, %dead
+// ///   }
+// ///   %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+// ///                               : memref<4x8xf32> to memref<4x1xf32>
+// ///   %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+// ///                                 -> !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpTensorDescOp final
+//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                 PatternRewriter &rewriter) const override;
+// };
+
+// /// Sink a store_nd feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
+// /// through the warp op interface they would be propagated as returned values.
+// /// Both the stored vector type and tensor descriptor types are distributed
+// /// according to sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// ///   vector.warp_execute_on_lane_0(%laneid) -> () {
+// ///     ...
+// ///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+// ///                                 !xegpu.tensor_desc<4x8xf32>
+// ///     vector.yield
+// ///   }
+// /// ```
+// /// To
+// /// ```
+// ///   %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+// ///     ...
+// ///     vector.yield
+// ///   }
+// ///   xegpu.store_nd %arg0, %arg1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpStoreNd final
+//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                 PatternRewriter &rewriter) const override;
+// };
+
+// /// Clone a load_nd feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
+// /// The warp op will still contain the original op that will not be used by the
+// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// the load's arguments.
+// /// Both the loaded vector type and tensor descriptor types are distributed
+// /// according to sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// ///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+// ///                   (!xegpu.tensor_desc<4x8xf32>) {
+// ///     ...
+// ///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+// ///     vector<4x8xf32> vector.yield %ld
+// ///   }
+// /// ```
+// /// To
+// /// ```
+// ///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+// ///     ...
+// ///     %dead = xegpu.load_nd %arg0, %arg1:
+// ///         !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+// ///     vector.yield %arg0, %arg1
+// ///   }
+// ///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpLoadNd final
+//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                 PatternRewriter &rewriter) const override;
+// };
+
+// FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
+//                                                xegpu::SGMapAttr sgMap) {
+//   llvm::SmallVector<int64_t, 2> distributedShape;
+//   auto layout = sgMap.getWiLayout();
+//   auto shape = originalT.getShape();
+//   for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+//     if (!divisible(APInt(64, o), APInt(64, l)))
+//       return failure();
+//     distributedShape.push_back(o / l);
+//   }
+//   auto newVectorType =
+//       VectorType::get(distributedShape, originalT.getElementType(),
+//                       originalT.getScalableDims());
+//   return newVectorType;
+// }
+
+// FailureOr<xegpu::TensorDescType>
+// getDistributedTensorDescType(xegpu::TensorDescType originalT,
+//                              xegpu::SGMapAttr sgMap,
+//                              xegpu::MemorySpace memSpace) {
+//   llvm::SmallVector<int64_t, 2> distributedShape;
+//   auto layout = sgMap.getWiLayout();
+//   auto shape = originalT.getShape();
+//   for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+//     if (!divisible(APInt(64, o), APInt(64, l)))
+//       return failure();
+//     distributedShape.push_back(o / l);
+//   }
+//   xegpu::TensorDescType distributedDescType;
+//   if (originalT.isScattered()) {
+
+//     distributedDescType = xegpu::TensorDescType::get(
+//         distributedShape, originalT.getElementType(), originalT.getChunkSize(),
+//         originalT.getMemorySpace(), originalT.getSGMapAttr());
+//   } else {
+//     distributedDescType = xegpu::TensorDescType::get(
+//         distributedShape, originalT.getElementType(),
+//         originalT.getBoundaryCheck(), originalT.getArrayLength(),
+//         originalT.getMemorySpace(), originalT.getSGMapAttr());
+//   }
+//   return distributedDescType;
+// }
+} // namespace
+
+// LogicalResult
+// WarpOpStoreNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                PatternRewriter &rewriter) const {
+//   auto yield = cast<vector::YieldOp>(
+//       warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+//   Operation *lastNode = yield->getPrevNode();
+//   auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+//   if (!storeOp)
+//     return failure();
+
+//   auto origType = storeOp.getTensorDescType();
+//   xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+//   if (!sgMap)
+//     return rewriter.notifyMatchFailure(
+//         storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+//   if (storeOp.getTensorDescType().getShape().size() != 2)
+//     return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+//   DBGS() << "Matched store_nd: " << storeOp << "\n";
+
+//   auto distributedTypeOrFailure =
+//       getDistributedVectorType(storeOp.getValueType(), sgMap);
+//   if (failed(distributedTypeOrFailure))
+//     return rewriter.notifyMatchFailure(storeOp,
+//                                        "Failed to distribute the type");
+//   VectorType newVectorType = distributedTypeOrFailure.value();
+
+//   auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+//       storeOp.getTensorDescType(), sgMap,
+//       storeOp.getTensorDescType().getMemorySpace());
+//   if (failed(distributedDescTypeOrFailure))
+//     return rewriter.notifyMatchFailure(storeOp,
+//                                        "Failed to distribute the desc type");
+//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+//   SmallVector<size_t> newRetIndices;
+//   vector::WarpExecuteOnLane0Op newWarpOp =
+//       moveRegionToNewWarpOpAndAppendReturns(
+//           rewriter, warpOp,
+//           ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+//           TypeRange{newTDescType, newVectorType}, newRetIndices);
+
+//   rewriter.setInsertionPointAfter(newWarpOp);
+//   auto newStoreOp =
+//       cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+//   rewriter.eraseOp(storeOp);
+//   newStoreOp.getTensorDescMutable().assign(
+//       newWarpOp.getResult(newRetIndices[0]));
+//   newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+//   return success();
+// }
+
+// LogicalResult WarpOpLoadNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                             PatternRewriter &rewriter) const {
+//   OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+//     return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+//   });
+
+//   if (!operand)
+//     return rewriter.notifyMatchFailure(warpOp,
+//                                        "warp result is not a xegpu::LoadNd op");
+
+//   auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+
+//   if (loadOp.getPacked())
+//     return rewriter.notifyMatchFailure(
+//         loadOp, "Packed load distribution not supported");
+
+//   xegpu::TensorDescType origType = loadOp.getTensorDescType();
+//   xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+//   if (!sgMap)
+//     return rewriter.notifyMatchFailure(
+//         loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+//   auto origShape = origType.getShape();
+//   if (origShape.size() != 2)
+//     return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
+
+//   auto distributedTypeOrFailure =
+//       getDistributedVectorType(loadOp.getType(), sgMap);
+//   if (failed(distributedTypeOrFailure))
+//     return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+//   VectorType newVectorType = distributedTypeOrFailure.value();
+
+//   auto distributedDescTypeOrFailure =
+//       getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
+//                                    loadOp.getTensorDescType().getMemorySpace());
+//   if (failed(distributedDescTypeOrFailure))
+//     return rewriter.notifyMatchFailure(loadOp,
+//                                        "Failed to distribute the desc type");
+//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+//   unsigned operandIdx = operand->getOperandNumber();
+
+//   SmallVector<size_t> newRetIndices;
+//   vector::WarpExecuteOnLane0Op newWarpOp =
+//       moveRegionToNewWarpOpAndAppendReturns(
+//           rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
+//           newRetIndices);
+
+//   rewriter.setInsertionPointAfter(newWarpOp);
+
+//   auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+//       loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+//       loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+//       loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+//   newLoadOp.getTensorDescMutable().assign(
+//       newWarpOp.getResult(newRetIndices[0]));
+//   Value distributedVal = newWarpOp.getResult(operandIdx);
+//   rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+
+//   return success();
+// }
+
+// LogicalResult
+// WarpOpTensorDescOp::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+//                                     PatternRewriter &rewriter) const {
+//   OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+//     return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+//   });
+
+//   if (!operand)
+//     return rewriter.notifyMatchFailure(
+//         warpOp, "warp result is not a xegpu::CreateNdDesc op");
+//   auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+//   assert(descOp && "desc op must be not null");
+//   unsigned operandIdx = operand->getOperandNumber();
+
+//   // TODO: is memref uniform in the region
+//   rewriter.setInsertionPoint(warpOp);
+//   auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+//   assert(srcTypedVal && "source value must be not null");
+
+//   auto descOffsets = descOp.getMixedOffsets();
+//   if (descOffsets.size() != 2)
+//     return rewriter.notifyMatchFailure(descOp,
+//                                        "offsets size is expected to be 2");
+
+//   xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+//   if (!sgMap)
+//     return rewriter.notifyMatchFailure(
+//         descOp, "the tensor descriptor lacks sg_map attribute");
+
+//   auto layout = sgMap.getWiLayout();
+
+//   // Calculate the offset within tensor descriptor for the current lane_id. The
+//   // access to proper element for a work item is done through a lane-specific
+//   // subview (tdesc offsets are used as base, lane shift is added on top).
+//   auto laneid = warpOp.getLaneid();
+//   auto xDim =
+//       rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
+//   auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
+//   auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
+
+//   auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+//                                                descOffsets[0]);
+//   auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+//                                                descOffsets[1]);
+//   auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
+//   auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
+
+//   auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+//       descOp.getType(), sgMap, descOp.getType().getMemorySpace());
+//   if (failed(distributedDescTypeOrFailure))
+//     return rewriter.notifyMatchFailure(descOp,
+//                                        "Failed to distribute the desc type");
+//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+//   auto distributedShape = newTDescType.getShape();
+//   // use the base memref strides
+//   SmallVector<OpFoldResult> overwriteStrides =
+//       getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
+//   SmallVector<OpFoldResult> overwriteSizes =
+//       getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
+
+//   SmallVector<size_t> newRetIndices;
+//   vector::WarpExecuteOnLane0Op newWarpOp =
+//       moveRegionToNewWarpOpAndAppendReturns(
+//           rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+//           newRetIndices);
+
+//   rewriter.setInsertionPointAfter(newWarpOp);
+//   auto subview = rewriter.create<memref::SubViewOp>(
+//       newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
+//       overwriteSizes, overwriteStrides);
+//   subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+
+//   auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
+//   auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+//       newWarpOp.getLoc(), newTDescType, subview,
+//       getAsOpFoldResult({zero, zero}));
+
+//   Value distributedVal = newWarpOp.getResult(operandIdx);
+//   rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+
+//   return success();
+// }
+
+void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
+  // patterns.add<WarpOpTensorDescOp>(patterns.getContext());
+  // patterns.add<WarpOpStoreNd>(patterns.getContext());
+  // patterns.add<WarpOpLoadNd>(patterns.getContext());
+}
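
To make the shape arithmetic behind the commented-out helpers (and the test
expectations added in the next file) concrete, here is a small stand-alone
sketch; the function name is made up, and the 24x32xf16 shape and
wi_layout = [1, 16] are the values used in the test:

  #include "mlir/IR/BuiltinTypes.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  using namespace mlir;

  // Each dimension of the value is divided by the corresponding wi_layout
  // entry: 24 / 1 = 24 rows and 32 / 16 = 2 columns, so a vector<24x32xf16>
  // distributes to vector<24x2xf16> per work item.
  static VectorType examplePerLaneType(MLIRContext *ctx) {
    SmallVector<int64_t> shape = {24, 32};
    SmallVector<int64_t> wiLayout = {1, 16};
    SmallVector<int64_t> distributed;
    for (auto [dim, lanes] : llvm::zip_equal(shape, wiLayout))
      distributed.push_back(dim / lanes); // assumes dim % lanes == 0
    return VectorType::get(distributed, Float16Type::get(ctx));
  }
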
diff --git a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
new file mode 100644
index 000000000000000..f9efda80ab64684
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
@@ -0,0 +1,81 @@
+// RUN: mlir-opt -test-xegpu-distribute -split-input-file %s | FileCheck %s
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_store_nd_distribution
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}}, %{{.*}} :  vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x2xf16>)
+// CHECK: ^bb0(%[[src:.*]]: vector<24x32xf16>, %[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: gpu.yield%[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
+// CHECK: xegpu.store_nd %[[res]]#1, %[[res]]#0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> :
+// CHECK-SAME: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+
+func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> () {
+  %laneid = gpu.lane_id
+  gpu.warp_execute_on_lane_0(%laneid)[16]
+        args(%src, %dst: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+    ^bb0(%arg0: vector<24x32xf16>, %arg1: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>):
+    xegpu.store_nd %arg0, %arg1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+  }
+  return
+}
+
+// -----
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_load_nd_distribution
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} :  !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK-SAME: -> (vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: ^bb0(%[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: %[[dead:.*]] = xegpu.load_nd
+// CHECK: gpu.yield%[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: %[[load:.*]] = xegpu.load_nd %[[res]]#1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<24x2xf16>
+// CHECK: return %[[load]]
+
+func.func @test_load_nd_distribution(%dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> (vector<24x2xf16>) {
+  %laneid = gpu.lane_id
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16]
+        args(%dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> (vector<24x2xf16>) {
+    ^bb0(%arg0: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>):
+    %0 = xegpu.load_nd %arg0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+       : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16> -> vector<24x32xf16>
+    gpu.yield %0 : vector<24x32xf16>
+  }
+  return %r : vector<24x2xf16>
+}
+
+// -----
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_create_nd_desc_distribution
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} : memref<24x32xf16>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
+// CHECK: ^bb0(%[[dst:.*]]: memref<24x32xf16>)
+// CHECK: %[[dead:.*]] = xegpu.create_nd_tdesc
+// CHECK: gpu.yield%[[dead]], %[[dst]] :
+// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>
+// CHECK: %[[view:.*]] = memref.subview %[[res]]#1[%[[C0]], %[[laneid]]] [24, 2] [1, 1] : memref<24x32xf16> to memref<24x2xf16, strided<[32, 1], offset: ?>>
+// CHECK: %[[desc:.*]] = xegpu.create_nd_tdesc %[[view]][0, 0] : memref<24x2xf16, strided<[32, 1], offset: ?>>
+// CHECK-SAME: -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space =  global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: return %[[desc]]
+
+func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+  %laneid = gpu.lane_id
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16]
+        args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+    ^bb0(%arg0: memref<24x32xf16>):
+    %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+    gpu.yield %0 : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+  }
+  return %r : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+}
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt
index 29fb4441a24fd28..a8fd70e6397a524 100644
--- a/mlir/test/lib/Dialect/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/CMakeLists.txt
@@ -22,3 +22,4 @@ add_subdirectory(TestDyn)
 add_subdirectory(Tosa)
 add_subdirectory(Transform)
 add_subdirectory(Vector)
+add_subdirectory(XeGPU)
diff --git a/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
new file mode 100644
index 000000000000000..d67439a8b6127e8
--- /dev/null
+++ b/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Exclude tests from libMLIR.so
+add_mlir_library(MLIRXeGPUTestPasses
+  TestXeGPUTransforms.cpp
+
+  EXCLUDE_FROM_LIBMLIR
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRPass
+  MLIRXeGPUTransforms
+  MLIRXeGPUDialect
+  MLIRSupport
+  )
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
new file mode 100644
index 000000000000000..a5060c8eeb916ea
--- /dev/null
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -0,0 +1,58 @@
+//===- TestXeGPUTransforms.cpp - Test XeGPU transforms and lowerings ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::xegpu;
+using namespace mlir::vector;
+
+namespace {
+struct TestXeGPUDistribution
+    : public PassWrapper<TestXeGPUDistribution, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUDistribution)
+
+  TestXeGPUDistribution() = default;
+  TestXeGPUDistribution(const TestXeGPUDistribution &pass)
+      : PassWrapper(pass) {}
+
+  StringRef getArgument() const final { return "test-xegpu-distribute"; }
+  StringRef getDescription() const final {
+    return "Test patterns for operations work item distribution";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<xegpu::XeGPUDialect>();
+    registry.insert<vector::VectorDialect>();
+    registry.insert<arith::ArithDialect>();
+    registry.insert<memref::MemRefDialect>();
+  }
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateXeGPUDistributePatterns(patterns);
+    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+  }
+};
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestXeGPUTransforms() {
+  PassRegistration<TestXeGPUDistribution>();
+}
+} // namespace test
+} // namespace mlir
\ No newline at end of file
diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt
index 8b79de58fa1028b..e4ffbbee7a1d947 100644
--- a/mlir/tools/mlir-opt/CMakeLists.txt
+++ b/mlir/tools/mlir-opt/CMakeLists.txt
@@ -47,6 +47,7 @@ if(MLIR_INCLUDE_TESTS)
     MLIRTilingInterfaceTestPasses
     MLIRVectorTestPasses
     MLIRTestVectorToSPIRV
+    MLIRXeGPUTestPasses
     MLIRLLVMTestPasses
     )
   set(test_libs ${test_libs}
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 960f7037a1b61fe..c89ff9964efa03c 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -153,6 +153,7 @@ void registerTestTransformDialectEraseSchedulePass();
 void registerTestPassStateExtensionCommunication();
 void registerTestVectorLowerings();
 void registerTestVectorReductionToSPIRVDotProd();
+void registerTestXeGPUTransforms();
 void registerTestVulkanRunnerPipeline();
 void registerTestWrittenToPass();
 #if MLIR_ENABLE_PDL_IN_PATTERNMATCH
@@ -291,6 +292,7 @@ void registerTestPasses() {
   mlir::test::registerTestTransformDialectEraseSchedulePass();
   mlir::test::registerTestPassStateExtensionCommunication();
   mlir::test::registerTestVectorLowerings();
+  mlir::test::registerTestXeGPUTransforms();
   mlir::test::registerTestVectorReductionToSPIRVDotProd();
   mlir::test::registerTestVulkanRunnerPipeline();
   mlir::test::registerTestWrittenToPass();

From 1c0692085ae2518768132e53e6436bf443e5e0a4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 19:47:16 +0000
Subject: [PATCH 03/15] fix

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp      | 231 +++++++++---------
 1 file changed, 116 insertions(+), 115 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 52c953697d0f346..5c45a293a8d4689 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
@@ -66,12 +67,12 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 // ///                                 -> !xegpu.tensor_desc<4x1xf32>
 // ///
 // /// ```
-// struct WarpOpTensorDescOp final
-//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                 PatternRewriter &rewriter) const override;
-// };
+struct WarpOpTensorDescOp final
+    : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override;
+};
 
 // /// Sink a store_nd feeding into vector.yield op for the enclosing
 // /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
@@ -159,32 +160,32 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 //   return newVectorType;
 // }
 
-// FailureOr<xegpu::TensorDescType>
-// getDistributedTensorDescType(xegpu::TensorDescType originalT,
-//                              xegpu::SGMapAttr sgMap,
-//                              xegpu::MemorySpace memSpace) {
-//   llvm::SmallVector<int64_t, 2> distributedShape;
-//   auto layout = sgMap.getWiLayout();
-//   auto shape = originalT.getShape();
-//   for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
-//     if (!divisible(APInt(64, o), APInt(64, l)))
-//       return failure();
-//     distributedShape.push_back(o / l);
-//   }
-//   xegpu::TensorDescType distributedDescType;
-//   if (originalT.isScattered()) {
-
-//     distributedDescType = xegpu::TensorDescType::get(
-//         distributedShape, originalT.getElementType(), originalT.getChunkSize(),
-//         originalT.getMemorySpace(), originalT.getSGMapAttr());
-//   } else {
-//     distributedDescType = xegpu::TensorDescType::get(
-//         distributedShape, originalT.getElementType(),
-//         originalT.getBoundaryCheck(), originalT.getArrayLength(),
-//         originalT.getMemorySpace(), originalT.getSGMapAttr());
-//   }
-//   return distributedDescType;
-// }
+FailureOr<xegpu::TensorDescType>
+getDistributedTensorDescType(xegpu::TensorDescType originalT,
+                             xegpu::SGMapAttr sgMap,
+                             xegpu::MemorySpace memSpace) {
+  llvm::SmallVector<int64_t, 2> distributedShape;
+  auto layout = sgMap.getWiLayout();
+  auto shape = originalT.getShape();
+  for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+    if (!divisible(APInt(64, o), APInt(64, l)))
+      return failure();
+    distributedShape.push_back(o / l);
+  }
+  xegpu::TensorDescType distributedDescType;
+  if (originalT.isScattered()) {
+
+    distributedDescType = xegpu::TensorDescType::get(
+        distributedShape, originalT.getElementType(), originalT.getChunkSize(),
+        originalT.getMemorySpace(), originalT.getSGMapAttr());
+  } else {
+    distributedDescType = xegpu::TensorDescType::get(
+        distributedShape, originalT.getElementType(),
+        originalT.getBoundaryCheck(), originalT.getArrayLength(),
+        originalT.getMemorySpace(), originalT.getSGMapAttr());
+  }
+  return distributedDescType;
+}
 } // namespace
 
 // LogicalResult
@@ -303,91 +304,91 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 //   return success();
 // }
 
-// LogicalResult
-// WarpOpTensorDescOp::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                     PatternRewriter &rewriter) const {
-//   OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
-//     return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
-//   });
-
-//   if (!operand)
-//     return rewriter.notifyMatchFailure(
-//         warpOp, "warp result is not a xegpu::CreateNdDesc op");
-//   auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
-//   assert(descOp && "desc op must be not null");
-//   unsigned operandIdx = operand->getOperandNumber();
-
-//   // TODO: is memref uniform in the region
-//   rewriter.setInsertionPoint(warpOp);
-//   auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
-//   assert(srcTypedVal && "source value must be not null");
-
-//   auto descOffsets = descOp.getMixedOffsets();
-//   if (descOffsets.size() != 2)
-//     return rewriter.notifyMatchFailure(descOp,
-//                                        "offsets size is expected to be 2");
-
-//   xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
-//   if (!sgMap)
-//     return rewriter.notifyMatchFailure(
-//         descOp, "the tensor descriptor lacks sg_map attribute");
-
-//   auto layout = sgMap.getWiLayout();
-
-//   // Calculate the offset within tensor descriptor for the current lane_id. The
-//   // access to proper element for a work item is done through a lane-specific
-//   // subview (tdesc offsets are used as base, lane shift is added on top).
-//   auto laneid = warpOp.getLaneid();
-//   auto xDim =
-//       rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
-//   auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
-//   auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
-
-//   auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-//                                                descOffsets[0]);
-//   auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-//                                                descOffsets[1]);
-//   auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
-//   auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
-
-//   auto distributedDescTypeOrFailure = getDistributedTensorDescType(
-//       descOp.getType(), sgMap, descOp.getType().getMemorySpace());
-//   if (failed(distributedDescTypeOrFailure))
-//     return rewriter.notifyMatchFailure(descOp,
-//                                        "Failed to distribute the desc type");
-//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-//   auto distributedShape = newTDescType.getShape();
-//   // use the base memref strides
-//   SmallVector<OpFoldResult> overwriteStrides =
-//       getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
-//   SmallVector<OpFoldResult> overwriteSizes =
-//       getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
-
-//   SmallVector<size_t> newRetIndices;
-//   vector::WarpExecuteOnLane0Op newWarpOp =
-//       moveRegionToNewWarpOpAndAppendReturns(
-//           rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
-//           newRetIndices);
-
-//   rewriter.setInsertionPointAfter(newWarpOp);
-//   auto subview = rewriter.create<memref::SubViewOp>(
-//       newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
-//       overwriteSizes, overwriteStrides);
-//   subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
-
-//   auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
-//   auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-//       newWarpOp.getLoc(), newTDescType, subview,
-//       getAsOpFoldResult({zero, zero}));
-
-//   Value distributedVal = newWarpOp.getResult(operandIdx);
-//   rewriter.replaceAllUsesWith(distributedVal, newDescOp);
-
-//   return success();
-// }
+LogicalResult
+WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                    PatternRewriter &rewriter) const {
+  OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+    return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+  });
+
+  if (!operand)
+    return rewriter.notifyMatchFailure(
+        warpOp, "warp result is not a xegpu::CreateNdDesc op");
+  auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+  assert(descOp && "desc op must be not null");
+  unsigned operandIdx = operand->getOperandNumber();
+
+  // TODO: check that the source memref is uniform across the warp region.
+  rewriter.setInsertionPoint(warpOp);
+  auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+  assert(srcTypedVal && "source value must be not null");
+
+  auto descOffsets = descOp.getMixedOffsets();
+  if (descOffsets.size() != 2)
+    return rewriter.notifyMatchFailure(descOp,
+                                       "offsets size is expected to be 2");
+
+  xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        descOp, "the tensor descriptor lacks sg_map attribute");
+
+  auto layout = sgMap.getWiLayout();
+
+  // Calculate the offset within tensor descriptor for the current lane_id. The
+  // access to proper element for a work item is done through a lane-specific
+  // subview (tdesc offsets are used as base, lane shift is added on top).
+  auto laneid = warpOp.getLaneid();
+  auto xDim =
+      rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
+  auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
+  auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
+
+  auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+                                               descOffsets[0]);
+  auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+                                               descOffsets[1]);
+  auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
+  auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
+
+  auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+      descOp.getType(), sgMap, descOp.getType().getMemorySpace());
+  if (failed(distributedDescTypeOrFailure))
+    return rewriter.notifyMatchFailure(descOp,
+                                       "Failed to distribute the desc type");
+  xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+  auto distributedShape = newTDescType.getShape();
+  // use the base memref strides
+  SmallVector<OpFoldResult> overwriteStrides =
+      getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
+  SmallVector<OpFoldResult> overwriteSizes =
+      getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
+
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp =
+      moveRegionToNewWarpOpAndAppendReturns(
+          rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+          newRetIndices);
+
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto subview = rewriter.create<memref::SubViewOp>(
+      newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
+      overwriteSizes, overwriteStrides);
+  subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+
+  auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
+  auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+      newWarpOp.getLoc(), newTDescType, subview,
+      getAsOpFoldResult({zero, zero}));
+
+  Value distributedVal = newWarpOp.getResult(operandIdx);
+  rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+
+  return success();
+}
 
 void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
-  // patterns.add<WarpOpTensorDescOp>(patterns.getContext());
+  patterns.add<WarpOpTensorDescOp>(patterns.getContext());
   // patterns.add<WarpOpStoreNd>(patterns.getContext());
   // patterns.add<WarpOpLoadNd>(patterns.getContext());
 }
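
The per-lane offset math in the pattern above reduces to a remainder/quotient split of the lane id over wi_layout[0], added to the base offsets of the original descriptor. Below is a minimal standalone sketch of that arithmetic (plain C++, not the MLIR pattern itself), assuming wi_layout = [1, 8], a 4x8 block, and zero base offsets as in the doc-comment example used elsewhere in this series:

```
// Sketch only: mirrors the remui/divui/addi sequence in WarpOpTensorDescOp.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t wiLayout[2] = {1, 8}; // sg_map wi_layout (assumed)
  const int64_t base[2] = {0, 0};     // offsets of the original tdesc (assumed)
  for (int64_t laneId = 0; laneId < 8; ++laneId) {
    int64_t shiftX = laneId % wiLayout[0]; // arith.remui -> always 0 here
    int64_t shiftY = laneId / wiLayout[0]; // arith.divui -> laneId here
    // Each lane's 4x1 subview is rooted at (base + shift).
    std::cout << "lane " << laneId << " -> subview[" << base[0] + shiftX
              << ", " << base[1] + shiftY << "] [4, 1] [1, 1]\n";
  }
  return 0;
}
```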

>From 9888c849327aa0caef34325c0a8333851503714d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 21:15:39 +0000
Subject: [PATCH 04/15] fix

---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td        | 3 +--
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp | 6 +++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5910aa3f7f2dae5..f3ffbd0f5a027d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -327,8 +327,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
   let hasVerifier = 1;
 }
 
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
-                                       AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
 
   let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 5c45a293a8d4689..ac02be782802182 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -170,7 +170,11 @@ getDistributedTensorDescType(xegpu::TensorDescType originalT,
   for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
     if (!divisible(APInt(64, o), APInt(64, l)))
       return failure();
-    distributedShape.push_back(o / l);
+    // Tensor descriptor is distributed only for the scattered case.
+    if (originalT.isScattered())
+      distributedShape.push_back(o / l);
+    else
+      distributedShape.push_back(o);
   }
   xegpu::TensorDescType distributedDescType;
   if (originalT.isScattered()) {
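
To make the shape change concrete: with a 24x32 block and wi_layout = [1, 16] (the configuration used by the xegpu-distribute.mlir tests in this series), the per-lane vector becomes 24x2 while a blocked (non-scattered) tensor descriptor keeps the full 24x32 shape, which is presumably why the AllShapesMatch constraint is dropped from store_nd above. A minimal standalone sketch of that arithmetic (plain C++, not the MLIR code):

```
// Sketch only: the distributed-shape computation under the stated assumptions.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> shape = {24, 32};   // original block shape (assumed)
  const std::vector<int64_t> wiLayout = {1, 16}; // sg_map wi_layout (assumed)

  std::vector<int64_t> laneVector;  // shape of the per-lane vector value
  std::vector<int64_t> blockedDesc; // shape of a blocked tensor_desc
  for (std::size_t i = 0; i < shape.size(); ++i) {
    assert(shape[i] % wiLayout[i] == 0 && "layout must divide the shape");
    laneVector.push_back(shape[i] / wiLayout[i]); // divided per work item
    blockedDesc.push_back(shape[i]); // blocked descriptors keep the full shape
  }
  // Prints: vector 24x2, blocked tensor_desc 24x32.
  std::cout << "vector " << laneVector[0] << "x" << laneVector[1]
            << ", blocked tensor_desc " << blockedDesc[0] << "x"
            << blockedDesc[1] << "\n";
  return 0;
}
```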

>From 491625d81d80cb0092ec5704f0de23c039e9c681 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 21:34:10 +0000
Subject: [PATCH 05/15] fix

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp      | 319 +++++++++---------
 1 file changed, 161 insertions(+), 158 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index ac02be782802182..95610a9af7ac85c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -33,11 +33,15 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 
 // /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
 // /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by the
-// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// The warp op will still contain the original op that will not be used by
+// the
+// /// yield op (and should be cleaned up later with dce). The yield op will
+// bypass
 // /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item and
-// /// appropriate offset. The distributed create_nd_tdesc points into the subview
+// /// The rewrite will create a subview of the size used by a single work item
+// and
+// /// appropriate offset. The distributed create_nd_tdesc points into the
+// subview
 // /// without offset. The tensor descriptor types is distributed according to
 // /// sg_map attribute.
 // ///
@@ -75,8 +79,10 @@ struct WarpOpTensorDescOp final
 };
 
 // /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
-// /// through the warp op interface they would be propagated as returned values.
+// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+// passed
+// /// through the warp op interface they would be propagated as returned
+// values.
 // /// Both the stored vector type and tensor descriptor types are distributed
 // /// according to sg_map attribute.
 // ///
@@ -97,20 +103,23 @@ struct WarpOpTensorDescOp final
 // ///     ...
 // ///     vector.yield
 // ///   }
-// ///   xegpu.store_nd %arg0, %arg1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// ///   xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+// !xegpu.tensor_desc<4x1xf32>
 // ///
 // /// ```
-// struct WarpOpStoreNd final
-//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                 PatternRewriter &rewriter) const override;
-// };
+struct WarpOpStoreNd final
+    : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override;
+};
 
 // /// Clone a load_nd feeding into vector.yield op for the enclosing
 // /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by the
-// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// The warp op will still contain the original op that will not be used by
+// the
+// /// yield op (and should be cleaned up later with dce). The yield op will
+// bypass
 // /// the load's arguments.
 // /// Both the loaded vector type and tensor descriptor types are distributed
 // /// according to sg_map attribute.
@@ -137,28 +146,27 @@ struct WarpOpTensorDescOp final
 // ///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
 // ///
 // /// ```
-// struct WarpOpLoadNd final
-//     : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-//   using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-//   LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                 PatternRewriter &rewriter) const override;
-// };
-
-// FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
-//                                                xegpu::SGMapAttr sgMap) {
-//   llvm::SmallVector<int64_t, 2> distributedShape;
-//   auto layout = sgMap.getWiLayout();
-//   auto shape = originalT.getShape();
-//   for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
-//     if (!divisible(APInt(64, o), APInt(64, l)))
-//       return failure();
-//     distributedShape.push_back(o / l);
-//   }
-//   auto newVectorType =
-//       VectorType::get(distributedShape, originalT.getElementType(),
-//                       originalT.getScalableDims());
-//   return newVectorType;
-// }
+struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
+                                               xegpu::SGMapAttr sgMap) {
+  llvm::SmallVector<int64_t, 2> distributedShape;
+  auto layout = sgMap.getWiLayout();
+  auto shape = originalT.getShape();
+  for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+    if (!divisible(APInt(64, o), APInt(64, l)))
+      return failure();
+    distributedShape.push_back(o / l);
+  }
+  auto newVectorType =
+      VectorType::get(distributedShape, originalT.getElementType(),
+                      originalT.getScalableDims());
+  return newVectorType;
+}
 
 FailureOr<xegpu::TensorDescType>
 getDistributedTensorDescType(xegpu::TensorDescType originalT,
@@ -192,121 +200,117 @@ getDistributedTensorDescType(xegpu::TensorDescType originalT,
 }
 } // namespace
 
-// LogicalResult
-// WarpOpStoreNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                PatternRewriter &rewriter) const {
-//   auto yield = cast<vector::YieldOp>(
-//       warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-//   Operation *lastNode = yield->getPrevNode();
-//   auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
-//   if (!storeOp)
-//     return failure();
-
-//   auto origType = storeOp.getTensorDescType();
-//   xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
-//   if (!sgMap)
-//     return rewriter.notifyMatchFailure(
-//         storeOp, "the source tensor descriptor lacks sg_map attribute");
-
-//   if (storeOp.getTensorDescType().getShape().size() != 2)
-//     return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-//   DBGS() << "Matched store_nd: " << storeOp << "\n";
-
-//   auto distributedTypeOrFailure =
-//       getDistributedVectorType(storeOp.getValueType(), sgMap);
-//   if (failed(distributedTypeOrFailure))
-//     return rewriter.notifyMatchFailure(storeOp,
-//                                        "Failed to distribute the type");
-//   VectorType newVectorType = distributedTypeOrFailure.value();
-
-//   auto distributedDescTypeOrFailure = getDistributedTensorDescType(
-//       storeOp.getTensorDescType(), sgMap,
-//       storeOp.getTensorDescType().getMemorySpace());
-//   if (failed(distributedDescTypeOrFailure))
-//     return rewriter.notifyMatchFailure(storeOp,
-//                                        "Failed to distribute the desc type");
-//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-
-//   SmallVector<size_t> newRetIndices;
-//   vector::WarpExecuteOnLane0Op newWarpOp =
-//       moveRegionToNewWarpOpAndAppendReturns(
-//           rewriter, warpOp,
-//           ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
-//           TypeRange{newTDescType, newVectorType}, newRetIndices);
-
-//   rewriter.setInsertionPointAfter(newWarpOp);
-//   auto newStoreOp =
-//       cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
-//   rewriter.eraseOp(storeOp);
-//   newStoreOp.getTensorDescMutable().assign(
-//       newWarpOp.getResult(newRetIndices[0]));
-//   newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
-
-//   return success();
-// }
-
-// LogicalResult WarpOpLoadNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-//                                             PatternRewriter &rewriter) const {
-//   OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
-//     return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
-//   });
-
-//   if (!operand)
-//     return rewriter.notifyMatchFailure(warpOp,
-//                                        "warp result is not a xegpu::LoadNd op");
-
-//   auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
-
-//   if (loadOp.getPacked())
-//     return rewriter.notifyMatchFailure(
-//         loadOp, "Packed load distribution not supported");
-
-//   xegpu::TensorDescType origType = loadOp.getTensorDescType();
-//   xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
-//   if (!sgMap)
-//     return rewriter.notifyMatchFailure(
-//         loadOp, "the source tensor descriptor lacks sg_map attribute");
-
-//   auto origShape = origType.getShape();
-//   if (origShape.size() != 2)
-//     return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
-
-//   auto distributedTypeOrFailure =
-//       getDistributedVectorType(loadOp.getType(), sgMap);
-//   if (failed(distributedTypeOrFailure))
-//     return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
-//   VectorType newVectorType = distributedTypeOrFailure.value();
-
-//   auto distributedDescTypeOrFailure =
-//       getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
-//                                    loadOp.getTensorDescType().getMemorySpace());
-//   if (failed(distributedDescTypeOrFailure))
-//     return rewriter.notifyMatchFailure(loadOp,
-//                                        "Failed to distribute the desc type");
-//   xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-
-//   unsigned operandIdx = operand->getOperandNumber();
-
-//   SmallVector<size_t> newRetIndices;
-//   vector::WarpExecuteOnLane0Op newWarpOp =
-//       moveRegionToNewWarpOpAndAppendReturns(
-//           rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
-//           newRetIndices);
-
-//   rewriter.setInsertionPointAfter(newWarpOp);
-
-//   auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
-//       loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
-//       loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
-//       loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
-
-//   newLoadOp.getTensorDescMutable().assign(
-//       newWarpOp.getResult(newRetIndices[0]));
-//   Value distributedVal = newWarpOp.getResult(operandIdx);
-//   rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
-
-//   return success();
-// }
+LogicalResult WarpOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                             PatternRewriter &rewriter) const {
+  auto yield = cast<gpu::YieldOp>(
+      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  Operation *lastNode = yield->getPrevNode();
+  auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+  if (!storeOp)
+    return failure();
+
+  auto origType = storeOp.getTensorDescType();
+  xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+  if (storeOp.getTensorDescType().getShape().size() != 2)
+    return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+  DBGS() << "Matched store_nd: " << storeOp << "\n";
+
+  auto distributedTypeOrFailure =
+      getDistributedVectorType(storeOp.getValueType(), sgMap);
+  if (failed(distributedTypeOrFailure))
+    return rewriter.notifyMatchFailure(storeOp,
+                                       "Failed to distribute the type");
+  VectorType newVectorType = distributedTypeOrFailure.value();
+
+  auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+      storeOp.getTensorDescType(), sgMap,
+      storeOp.getTensorDescType().getMemorySpace());
+  if (failed(distributedDescTypeOrFailure))
+    return rewriter.notifyMatchFailure(storeOp,
+                                       "Failed to distribute the desc type");
+  xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, warpOp, ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+      TypeRange{newTDescType, newVectorType}, newRetIndices);
+
+  rewriter.setInsertionPointAfter(newWarpOp);
+  auto newStoreOp =
+      cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+  rewriter.eraseOp(storeOp);
+  newStoreOp.getTensorDescMutable().assign(
+      newWarpOp.getResult(newRetIndices[0]));
+  newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+  return success();
+}
+
+LogicalResult WarpOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                            PatternRewriter &rewriter) const {
+  OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+    return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+  });
+
+  if (!operand)
+    return rewriter.notifyMatchFailure(warpOp,
+                                       "warp result is not a xegpu::LoadNd op");
+
+  auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+
+  if (loadOp.getPacked())
+    return rewriter.notifyMatchFailure(
+        loadOp, "Packed load distribution not supported");
+
+  xegpu::TensorDescType origType = loadOp.getTensorDescType();
+  xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+  if (!sgMap)
+    return rewriter.notifyMatchFailure(
+        loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+  auto origShape = origType.getShape();
+  if (origShape.size() != 2)
+    return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
+
+  auto distributedTypeOrFailure =
+      getDistributedVectorType(loadOp.getType(), sgMap);
+  if (failed(distributedTypeOrFailure))
+    return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+  VectorType newVectorType = distributedTypeOrFailure.value();
+
+  auto distributedDescTypeOrFailure =
+      getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
+                                   loadOp.getTensorDescType().getMemorySpace());
+  if (failed(distributedDescTypeOrFailure))
+    return rewriter.notifyMatchFailure(loadOp,
+                                       "Failed to distribute the desc type");
+  xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+  unsigned operandIdx = operand->getOperandNumber();
+
+  SmallVector<size_t> newRetIndices;
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
+      newRetIndices);
+
+  rewriter.setInsertionPointAfter(newWarpOp);
+
+  auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+      loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+      loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+      loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+  newLoadOp.getTensorDescMutable().assign(
+      newWarpOp.getResult(newRetIndices[0]));
+  Value distributedVal = newWarpOp.getResult(operandIdx);
+  rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+
+  return success();
+}
 
 LogicalResult
 WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -369,10 +373,9 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
       getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
 
   SmallVector<size_t> newRetIndices;
-  gpu::WarpExecuteOnLane0Op newWarpOp =
-      moveRegionToNewWarpOpAndAppendReturns(
-          rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
-          newRetIndices);
+  gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+      rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+      newRetIndices);
 
   rewriter.setInsertionPointAfter(newWarpOp);
   auto subview = rewriter.create<memref::SubViewOp>(
@@ -393,6 +396,6 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
 
 void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WarpOpTensorDescOp>(patterns.getContext());
-  // patterns.add<WarpOpStoreNd>(patterns.getContext());
-  // patterns.add<WarpOpLoadNd>(patterns.getContext());
+  patterns.add<WarpOpStoreNd>(patterns.getContext());
+  patterns.add<WarpOpLoadNd>(patterns.getContext());
 }

>From 07f9f9f77953c4278912afd12910f2264f3e36e3 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 22:37:28 +0000
Subject: [PATCH 06/15] fix

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp      | 29 ++----------------
 mlir/test/Dialect/XeGPU/xegpu-distribute.mlir | 30 ++++++++-----------
 2 files changed, 16 insertions(+), 43 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 95610a9af7ac85c..297cff8d0144d57 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -341,24 +341,6 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
     return rewriter.notifyMatchFailure(
         descOp, "the tensor descriptor lacks sg_map attribute");
 
-  auto layout = sgMap.getWiLayout();
-
-  // Calculate the offset within tensor descriptor for the current lane_id. The
-  // access to proper element for a work item is done through a lane-specific
-  // subview (tdesc offsets are used as base, lane shift is added on top).
-  auto laneid = warpOp.getLaneid();
-  auto xDim =
-      rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
-  auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
-  auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
-
-  auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-                                               descOffsets[0]);
-  auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-                                               descOffsets[1]);
-  auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
-  auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
-
   auto distributedDescTypeOrFailure = getDistributedTensorDescType(
       descOp.getType(), sgMap, descOp.getType().getMemorySpace());
   if (failed(distributedDescTypeOrFailure))
@@ -378,15 +360,10 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
       newRetIndices);
 
   rewriter.setInsertionPointAfter(newWarpOp);
-  auto subview = rewriter.create<memref::SubViewOp>(
-      newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
-      overwriteSizes, overwriteStrides);
-  subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
-
-  auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
   auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-      newWarpOp.getLoc(), newTDescType, subview,
-      getAsOpFoldResult({zero, zero}));
+      newWarpOp.getLoc(), newTDescType,
+      dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
+      descOffsets);
 
   Value distributedVal = newWarpOp.getResult(operandIdx);
   rewriter.replaceAllUsesWith(distributedVal, newDescOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
index f9efda80ab64684..755f4922bfaa236 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
@@ -8,9 +8,9 @@
 // CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}}, %{{.*}} :  vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
 // CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x2xf16>)
 // CHECK: ^bb0(%[[src:.*]]: vector<24x32xf16>, %[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
-// CHECK: gpu.yield%[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
+// CHECK: gpu.yield %[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
 // CHECK: xegpu.store_nd %[[res]]#1, %[[res]]#0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> :
-// CHECK-SAME: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
 
 func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> () {
   %laneid = gpu.lane_id
@@ -23,7 +23,6 @@ func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tens
 }
 
 // -----
-
 #sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
 #blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
 
@@ -33,7 +32,7 @@ func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tens
 // CHECK-SAME: -> (vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
 // CHECK: ^bb0(%[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
 // CHECK: %[[dead:.*]] = xegpu.load_nd
-// CHECK: gpu.yield%[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: gpu.yield %[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
 // CHECK: %[[load:.*]] = xegpu.load_nd %[[res]]#1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
 // CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<24x2xf16>
 // CHECK: return %[[load]]
@@ -56,26 +55,23 @@ func.func @test_load_nd_distribution(%dst: !xegpu.tensor_desc<24x32xf16, #blk_td
 #blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
 
 // CHECK-LABEL: test_create_nd_desc_distribution
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
 // CHECK: %[[laneid:.*]] = gpu.lane_id
 // CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} : memref<24x32xf16>)
-// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<12x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
 // CHECK: ^bb0(%[[dst:.*]]: memref<24x32xf16>)
 // CHECK: %[[dead:.*]] = xegpu.create_nd_tdesc
-// CHECK: gpu.yield%[[dead]], %[[dst]] :
-// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>
-// CHECK: %[[view:.*]] = memref.subview %[[res]]#1[%[[C0]], %[[laneid]]] [24, 2] [1, 1] : memref<24x32xf16> to memref<24x2xf16, strided<[32, 1], offset: ?>>
-// CHECK: %[[desc:.*]] = xegpu.create_nd_tdesc %[[view]][0, 0] : memref<24x2xf16, strided<[32, 1], offset: ?>>
-// CHECK-SAME: -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space =  global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-// CHECK: return %[[desc]]
+// CHECK: gpu.yield %[[dead]], %[[dst]] :
+
 
-func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>) {
   %laneid = gpu.lane_id
+  %c12 = arith.constant 12 : index
   %r = gpu.warp_execute_on_lane_0(%laneid)[16]
-        args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+        args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>) {
     ^bb0(%arg0: memref<24x32xf16>):
-    %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
-    gpu.yield%0 : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+    %0 = xegpu.create_nd_tdesc %arg0[%c12, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
+    gpu.yield %0 : !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
   }
-  return %r : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+  return %r : !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
 }
\ No newline at end of file

>From b842f331ffab253aaf8924b5a6574d35c4fdcf9d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 22:45:02 +0000
Subject: [PATCH 07/15] fix

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp      | 200 +++++++++---------
 1 file changed, 99 insertions(+), 101 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 297cff8d0144d57..e3f48919c392667 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -31,46 +31,46 @@ using namespace mlir;
 namespace {
 bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 
-// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item
-// and
-// /// appropriate offset. The distributed create_nd_tdesc points into the
-// subview
-// /// without offset. The tensor descriptor types is distributed according to
-// /// sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// ///   %r = vector.warp_execute_on_lane_0(%laneid) ->
-// ///                   (!xegpu.tensor_desc<4x8xf32>) {
-// ///     ...
-// ///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
-// ///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// ///     vector.yield %td
-// ///   }
-// /// ```
-// /// To
-// /// ```
-// ///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// ///     ...
-// ///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
-// ///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// ///     vector.yield %arg0, %dead
-// ///   }
-// ///   %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
-// ///                               : memref<4x8xf32> to memref<4x1xf32>
-// ///   %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
-// ///                                 -> !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the create_nd_tdesc's arguments.
+/// The rewrite will create a subview of the size used by a single work item
+/// and an appropriate offset. The distributed create_nd_tdesc points into
+/// the subview without offset. The tensor descriptor type is distributed
+/// according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+///                   (!xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     vector.yield %td
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     vector.yield %arg0, %dead
+///   }
+///   %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+///                               : memref<4x8xf32> to memref<4x1xf32>
+///   %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+///                                 -> !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpTensorDescOp final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
@@ -78,35 +78,35 @@ struct WarpOpTensorDescOp final
                                 PatternRewriter &rewriter) const override;
 };
 
-// /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
-// passed
-// /// through the warp op interface they would be propagated as returned
-// values.
-// /// Both the stored vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// ///   vector.warp_execute_on_lane_0(%laneid) -> () {
-// ///     ...
-// ///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
-// ///                                 !xegpu.tensor_desc<4x8xf32>
-// ///     vector.yield
-// ///   }
-// /// ```
-// /// To
-// /// ```
-// ///   %r = vector.warp_execute_on_lane_0(%laneid) -> () {
-// ///     ...
-// ///     vector.yield
-// ///   }
-// ///   xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
-// !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Sink a store_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+/// passed through the warp op interface they would be propagated as
+/// returned values.
+/// Both the stored vector type and tensor descriptor types are distributed
+/// according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   vector.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///                                 !xegpu.tensor_desc<4x8xf32>
+///     vector.yield
+///   }
+/// ```
+/// To
+/// ```
+///   %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     vector.yield
+///   }
+///   xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+///   !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpStoreNd final
     : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
@@ -114,38 +114,36 @@ struct WarpOpStoreNd final
                                 PatternRewriter &rewriter) const override;
 };
 
-// /// Clone a load_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the load's arguments.
-// /// Both the loaded vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// ///   %r = vector.warp_execute_on_lane_0(%laneid) ->
-// ///                   (!xegpu.tensor_desc<4x8xf32>) {
-// ///     ...
-// ///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
-// ///     vector<4x8xf32> vector.yield %ld
-// ///   }
-// /// ```
-// /// To
-// /// ```
-// ///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// ///     ...
-// ///     %dead = xegpu.load_nd %arg0, %arg1:
-// ///         !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
-// ///     vector.yield %arg0, %arg1
-// ///   }
-// ///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Both the loaded vector type and tensor
+/// descriptor types are distributed according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+///                   (!xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+///     vector<4x8xf32> vector.yield %ld
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     %dead = xegpu.load_nd %arg0, %arg1:
+///         !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+///     vector.yield %arg0, %arg1
+///   }
+///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+///   !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
 struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
   using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,

>From 69cbc3bf6e18d4573f8a5a1616a838ecf6502c18 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 22:48:10 +0000
Subject: [PATCH 08/15] fix

---
 mlir/test/Dialect/XeGPU/xegpu-distribute.mlir | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
index 755f4922bfaa236..f96d149e01c3406 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
@@ -62,6 +62,8 @@ func.func @test_load_nd_distribution(%dst: !xegpu.tensor_desc<24x32xf16, #blk_td
 // CHECK: ^bb0(%[[dst:.*]]: memref<24x32xf16>)
 // CHECK: %[[dead:.*]] = xegpu.create_nd_tdesc
 // CHECK: gpu.yield %[[dead]], %[[dst]] :
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[res]]#1[%[[C12]], 0] : memref<24x32xf16> -> !xegpu.tensor_desc<12x32xf16, #xegpu.block_tdesc_attr<memory_space =  global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: return %[[T2]] : !xegpu.tensor_desc<12x32xf16, #xegpu.block_tdesc_attr<memory_space =  global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
 
 
 func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>) {

>From e7ca3cd82e0e4b705fcbd2f235bf005e52630995 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 13 Dec 2024 17:10:23 +0000
Subject: [PATCH 09/15] fix

---
 .../Dialect/XeGPU/Transforms/XeGPUDistribute.cpp   | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index e3f48919c392667..38c8799f8882294 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -71,9 +71,8 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 ///                                 -> !xegpu.tensor_desc<4x1xf32>
 ///
 /// ```
-struct WarpOpTensorDescOp final
-    : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
-  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+struct WarpOpTensorDescOp final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
@@ -107,9 +106,8 @@ struct WarpOpTensorDescOp final
 ///   !xegpu.tensor_desc<4x1xf32>
 ///
 /// ```
-struct WarpOpStoreNd final
-    : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
-  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+struct WarpOpStoreNd final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };
@@ -144,8 +142,8 @@ struct WarpOpStoreNd final
 ///   !xegpu.tensor_desc<4x1xf32>
 ///
 /// ```
-struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
-  using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+struct WarpOpLoadNd final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override;
 };

>From b443c7127515d0c3895b2f96e71b17cc62e85018 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 30 Jan 2025 21:10:03 +0000
Subject: [PATCH 10/15] sync

---
 mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 38c8799f8882294..902f14cda97b291 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -213,7 +213,6 @@ LogicalResult WarpOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
 
   if (storeOp.getTensorDescType().getShape().size() != 2)
     return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-  DBGS() << "Matched store_nd: " << storeOp << "\n";
 
   auto distributedTypeOrFailure =
       getDistributedVectorType(storeOp.getValueType(), sgMap);

>From 6f11f3c8008dabd195236afd556cf3ba13e65bfb Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 31 Jan 2025 20:59:54 +0000
Subject: [PATCH 11/15] fix comments

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp       | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 902f14cda97b291..6b27d15de1346bc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -32,7 +32,7 @@ namespace {
 bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 
 /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
 /// The warp op will still contain the original op that will not be used by
 /// the yield op (and should be cleaned up later with dce). The yield op will
@@ -49,7 +49,7 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 ///
 /// ```
 ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
 ///                   (!xegpu.tensor_desc<4x8xf32>) {
 ///     ...
 ///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
@@ -59,7 +59,7 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
 /// ```
 /// To
 /// ```
-///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
 ///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
 ///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
@@ -78,7 +78,7 @@ struct WarpOpTensorDescOp final : public gpu::WarpDistributionPattern {
 };
 
 /// Sink a store_nd feeding into vector.yield op for the enclosing
-/// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are
 /// passed
 /// through the warp op interface they would be propagated as returned
 /// values.
@@ -89,7 +89,7 @@ struct WarpOpTensorDescOp final : public gpu::WarpDistributionPattern {
 ///
 /// ```
 ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-///   vector.warp_execute_on_lane_0(%laneid) -> () {
+///   gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
 ///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
 ///                                 !xegpu.tensor_desc<4x8xf32>
@@ -98,7 +98,7 @@ struct WarpOpTensorDescOp final : public gpu::WarpDistributionPattern {
 /// ```
 /// To
 /// ```
-///   %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   %r = gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
 ///     vector.yield
 ///   }
@@ -113,7 +113,7 @@ struct WarpOpStoreNd final : public gpu::WarpDistributionPattern {
 };
 
 /// Clone a load_nd feeding into vector.yield op for the enclosing
-/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
 /// The warp op will still contain the original op that will not be used by
 /// the yield op (and should be cleaned up later with dce). The yield op will
 /// bypass the load's arguments. Both the loaded vector type and tensor
@@ -123,7 +123,7 @@ struct WarpOpStoreNd final : public gpu::WarpDistributionPattern {
 ///
 /// ```
 ///   #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-///   %r = vector.warp_execute_on_lane_0(%laneid) ->
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
 ///                   (!xegpu.tensor_desc<4x8xf32>) {
 ///     ...
 ///     %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
@@ -132,7 +132,7 @@ struct WarpOpStoreNd final : public gpu::WarpDistributionPattern {
 /// ```
 /// To
 /// ```
-///   %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
 ///     ...
 ///     %dead = xegpu.load_nd %arg0, %arg1:
 ///         !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>

>From 7d1c7a6acaaf46cacce48df5302b6a0b79124e6d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 31 Jan 2025 22:55:50 +0000
Subject: [PATCH 12/15] add mem side effects interface

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 393 +++++++++---------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |  26 +-
 2 files changed, 228 insertions(+), 191 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index c2335eecc3781d9..d98aa9ffb26f1a5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -276,97 +276,103 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
 }
 
 
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
-  let summary = "loads a n-D block from memory (represented by TensorDesc)"
-                "to registers (represented by vector)";
-  let description = [{
-    LoadNdOp essentially mimics the hardware block read instruction to read
-    a block of data from memory to register. It takes a set of optional cache
-    hints for each level of cache, L1, L2 and L3. If hardware does not have a
-    correspoding cache, Corresponding cache hint attribute will be masked.
-    VNNI transformation is an hardware feature for Intel GPU, which is used to
-    do data packing during the load for B operand of matrix operation, if
-    the bit width of the data type is less then 32 bits, e.g., fp16. And
-    transpose is another Intel hardware feature, which will do transpose
-    operation when loading the data if the bit width of the data type is
-    fp32 or fp64. It implies that vnni and transpose cannot exit at the
-    same time.
-
-    Example:
-    ```mlir
-      xegpu.load_nd %1 {transpose = [1, 0],
-                        l1_hint = #xegpu.cache_hint<cached>,
-                        l2_hint = #xegpu.cache_hint<uncached>,
-                        l3_hint = #xegpu.cache_hint<streaming>}
-              : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
-    ```
-
-
-  }];
-
-  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                       OptionalAttr<UnitAttr>: $packed,
-                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
-  let results = (outs XeGPU_ValueType: $value);
-
-  let extraClassDeclaration = extraBaseClassDeclaration # [{
-    VectorType getType() {
-      return llvm::dyn_cast<VectorType>(getValue().getType());
-    }
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
+    AllElementTypesMatch<["value", "TensorDesc"]>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  ]> {
+    let summary = "loads a n-D block from memory (represented by TensorDesc)"
+                  "to registers (represented by vector)";
+    let description = [{
+      LoadNdOp essentially mimics the hardware block read instruction to read
+      a block of data from memory to register. It takes a set of optional cache
+      hints for each level of cache, L1, L2 and L3. If hardware does not have a
+      correspoding cache, Corresponding cache hint attribute will be masked.
+      VNNI transformation is an hardware feature for Intel GPU, which is used to
+      do data packing during the load for B operand of matrix operation, if
+      the bit width of the data type is less then 32 bits, e.g., fp16. And
+      transpose is another Intel hardware feature, which will do transpose
+      operation when loading the data if the bit width of the data type is
+      fp32 or fp64. It implies that vnni and transpose cannot exit at the
+      same time.
+
+      Example:
+      ```mlir
+        xegpu.load_nd %1 {transpose = [1, 0],
+                          l1_hint = #xegpu.cache_hint<cached>,
+                          l2_hint = #xegpu.cache_hint<uncached>,
+                          l3_hint = #xegpu.cache_hint<streaming>}
+                : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+      ```
+
+
+    }];
+
+    let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                        OptionalAttr<UnitAttr>: $packed,
+                        OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+    let results = (outs XeGPU_ValueType: $value);
+
+    let extraClassDeclaration = extraBaseClassDeclaration # [{
+      VectorType getType() {
+        return llvm::dyn_cast<VectorType>(getValue().getType());
+      }
 
-    xegpu::TensorDescType getTensorDescType() {
-      return getTensorDesc().getType();
-    }
-  }];
+      xegpu::TensorDescType getTensorDescType() {
+        return getTensorDesc().getType();
+      }
+    }];
 
-  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
-  let hasVerifier = 1;
+    let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+    let hasVerifier = 1;
 }
 
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
-  let summary = "stores a n-D block register region back to memory, currently only supports 2D";
-
-  let description = [{
-    StoreNdOp essentially mimics the hardware block write instruction io
-    write a block of data from register into the memory region as described
-    by the TensorDesc. It takes a set of optional cache hints for each level
-    of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
-    Corresponding cache hint attribute will be masked.
-
-    Example:
-    ```mlir
-      xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-                             l2_hint = #xegpu.cache_hint<write_back>,
-                             l3_hint = #xegpu.cache_hint<write_through>}
-                             : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
-    ```
-
-
-  }];
-
-  let arguments = (ins XeGPU_ValueType: $value,
-                       XeGPU_TensorDesc: $TensorDesc,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
-  let extraClassDeclaration = extraBaseClassDeclaration # [{
-    VectorType getValueType() {
-      return llvm::dyn_cast<VectorType>(getValue().getType());
-    }
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
+    AllElementTypesMatch<["value", "TensorDesc"]>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  ]> {
+    let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+
+    let description = [{
+      StoreNdOp essentially mimics the hardware block write instruction io
+      write a block of data from register into the memory region as described
+      by the TensorDesc. It takes a set of optional cache hints for each level
+      of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
+      Corresponding cache hint attribute will be masked.
+
+      Example:
+      ```mlir
+        xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                              l2_hint = #xegpu.cache_hint<write_back>,
+                              l3_hint = #xegpu.cache_hint<write_through>}
+                              : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+      ```
+
+
+    }];
+
+    let arguments = (ins XeGPU_ValueType: $value,
+                        XeGPU_TensorDesc: $TensorDesc,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+    let extraClassDeclaration = extraBaseClassDeclaration # [{
+      VectorType getValueType() {
+        return llvm::dyn_cast<VectorType>(getValue().getType());
+      }
 
-    xegpu::TensorDescType getTensorDescType() {
-      return getTensorDesc().getType();
-    }
-  }];
+      xegpu::TensorDescType getTensorDescType() {
+        return getTensorDesc().getType();
+      }
+    }];
 
-  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
-                        `:` type($value) `,` qualified(type($TensorDesc))}];
-  let hasVerifier = 1;
+    let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+                          `:` type($value) `,` qualified(type($TensorDesc))}];
+    let hasVerifier = 1;
 }
 
 def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
@@ -548,131 +554,138 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
   let hasVerifier = 1;
 }
 
-def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
-  let summary = "load a set of scattered data points from memory.";
-
-  let description = [{ It (aka. load) load data per each work-item. The output
-    describes the data being loaded at the subgroup level, so its size is
-    consistent with the number of work-items in a subgroup. When the chunk size
-    is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
-    to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
-    Specially, there is a transpose effect on the result (as compared to the TensorDesc)
-    due to the hardware implementation. Therefore, a transpose attribute is introduced
-    on purpose, making sure users are aware of this implicit transformation.
-
-    The mask operand masks out memory access so that it is safe to pass out-of-boundary
-    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-
-  Example 1:
-  ```mlir
-    %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
-                            l2_hint = #xegpu.cache_hint<uncached>,
-                            l3_hint = #xegpu.cache_hint<uncached>}
-          : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
-            vector<16xi1> -> vector<16xf32>
-  ```
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
+    AllElementTypesMatch<["value", "TensorDesc"]>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  ]> {
+    let summary = "load a set of scattered data points from memory.";
+
+    let description = [{ It (aka. load) load data per each work-item. The output
+      describes the data being loaded at the subgroup level, so its size is
+      consistent with the number of work-items in a subgroup. When the chunk size
+      is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
+      to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
+      Specially, there is a transpose effect on the result (as compared to the TensorDesc)
+      due to the hardware implementation. Therefore, a transpose attribute is introduced
+      on purpose, making sure users are aware of this implicit transformation.
+
+      The mask operand masks out memory access so that it is safe to pass out-of-boundary
+      addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+    Example 1:
+    ```mlir
+      %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<uncached>,
+                              l3_hint = #xegpu.cache_hint<uncached>}
+            : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+              vector<16xi1> -> vector<16xf32>
+    ```
 
-  Example 2:
-  ```mlir
-    %2 = xegpu.load %1, %0 {transpose,
-                            l1_hint = #xegpu.cache_hint<cached>,
-                            l2_hint = #xegpu.cache_hint<uncached>,
-                            l3_hint = #xegpu.cache_hint<uncached>}
-          : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
-            vector<16xi1> -> vector<8x16xf32>
-  ```
+    Example 2:
+    ```mlir
+      %2 = xegpu.load %1, %0 {transpose,
+                              l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<uncached>,
+                              l3_hint = #xegpu.cache_hint<uncached>}
+            : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+              vector<16xi1> -> vector<8x16xf32>
+    ```
 
-  }];
+    }];
 
-  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                       XeGPU_MaskType: $mask,
-                       OptionalAttr<UnitAttr>: $transpose,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-  let results = (outs XeGPU_ValueType: $value);
+    let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                        XeGPU_MaskType: $mask,
+                        OptionalAttr<UnitAttr>: $transpose,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+    let results = (outs XeGPU_ValueType: $value);
 
-  let extraClassDeclaration = extraBaseClassDeclaration # [{
-    xegpu::TensorDescType getTensorDescType() {
-      return getTensorDesc().getType();
-    }
+    let extraClassDeclaration = extraBaseClassDeclaration # [{
+      xegpu::TensorDescType getTensorDescType() {
+        return getTensorDesc().getType();
+      }
 
-    mlir::Type getElementType() {
-      auto type = getValue().getType();
-      return getElementTypeOrSelf(type);
-    }
+      mlir::Type getElementType() {
+        auto type = getValue().getType();
+        return getElementTypeOrSelf(type);
+      }
 
-    Type getValueType() {
-      return getValue().getType();
-    }
+      Type getValueType() {
+        return getValue().getType();
+      }
 
-    Type getMaskType() {
-      return getMask().getType();
-    }
+      Type getMaskType() {
+        return getMask().getType();
+      }
 
-  }];
+    }];
 
-  let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
-      `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+    let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+        `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
 
-  let hasVerifier = 1;
+    let hasVerifier = 1;
 }
 
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
-  let summary = "store data to scattered memory locations.";
-  let description = [{ It (aka. store) stores data to scattered memory locations. The value is
-  typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
-  a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
-  and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
-  has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
-  introduced on purpose, making sure users are aware of this implicit transformation.
-
-  Example 1:
-  ```mlir
-    %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-                                 l2_hint = #xegpu.cache_hint<write_back>,
-                                 l3_hint = #xegpu.cache_hint<write_through>}
-          : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
-  ```
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", 
+  [
+    AllElementTypesMatch<["value", "TensorDesc"]>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  ]> {
+    let summary = "store data to scattered memory locations.";
+    let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+    typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+    a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
+    and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
+    has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
+    introduced on purpose, making sure users are aware of this implicit transformation.
+
+    Example 1:
+    ```mlir
+      %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                                  l2_hint = #xegpu.cache_hint<write_back>,
+                                  l3_hint = #xegpu.cache_hint<write_through>}
+            : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+    ```
 
-  Example 2:
-  ```mlir
-    %3 = xegpu.store %0, %1, %2 {transpose,
-                                 l1_hint = #xegpu.cache_hint<uncached>,
-                                 l2_hint = #xegpu.cache_hint<write_back>,
-                                 l3_hint = #xegpu.cache_hint<write_through>}
-          : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
-  ```
+    Example 2:
+    ```mlir
+      %3 = xegpu.store %0, %1, %2 {transpose,
+                                  l1_hint = #xegpu.cache_hint<uncached>,
+                                  l2_hint = #xegpu.cache_hint<write_back>,
+                                  l3_hint = #xegpu.cache_hint<write_through>}
+            : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+    ```
 
-  }];
+    }];
 
-  let arguments = (ins
-    XeGPU_ValueType: $value,
-    XeGPU_TensorDesc: $TensorDesc,
-    XeGPU_MaskType: $mask,
-    OptionalAttr<UnitAttr>: $transpose,
-    OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-    OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-    OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+    let arguments = (ins
+      XeGPU_ValueType: $value,
+      XeGPU_TensorDesc: $TensorDesc,
+      XeGPU_MaskType: $mask,
+      OptionalAttr<UnitAttr>: $transpose,
+      OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+      OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+      OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  let extraClassDeclaration = extraBaseClassDeclaration # [{
-    xegpu::TensorDescType getTensorDescType() {
-      return getTensorDesc().getType();
-    }
+    let extraClassDeclaration = extraBaseClassDeclaration # [{
+      xegpu::TensorDescType getTensorDescType() {
+        return getTensorDesc().getType();
+      }
 
-    Type getValueType() {
-      return getValue().getType();
-    }
+      Type getValueType() {
+        return getValue().getType();
+      }
 
-    Type getMaskType() {
-      return getMask().getType();
-    }
-  }];
+      Type getMaskType() {
+        return getMask().getType();
+      }
+    }];
 
-  let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
-            `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+    let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+              `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
 
-  let hasVerifier = 1;
+    let hasVerifier = 1;
 }
 
 def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cd883baa986b85e..d015e5772a94f2a 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -324,6 +324,12 @@ LogicalResult LoadNdOp::verify() {
   return success();
 }
 
+void LoadNdOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Read::get());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreNdOp
 //===----------------------------------------------------------------------===//
@@ -361,6 +367,12 @@ LogicalResult StoreNdOp::verify() {
   return success();
 }
 
+void StoreNdOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Write::get());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_UpdateNDOffsetOp
 //===----------------------------------------------------------------------===//
@@ -494,7 +506,7 @@ LogicalResult PrefetchOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_LoadGatherOp
+// XeGPU_jrOp
 //===----------------------------------------------------------------------===//
 LogicalResult LoadGatherOp::verify() {
   auto tdescTy = getTensorDescType();
@@ -553,6 +565,12 @@ LogicalResult LoadGatherOp::verify() {
   return success();
 }
 
+void LoadGatherOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Read::get());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreScatterOp
 //===----------------------------------------------------------------------===//
@@ -605,6 +623,12 @@ LogicalResult StoreScatterOp::verify() {
   return success();
 }
 
+void StoreScatterOp::getEffects(
+    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+        &effects) {
+  effects.emplace_back(MemoryEffects::Write::get());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_UpdateOffsetOp
 //===----------------------------------------------------------------------===//
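With `DeclareOpInterfaceMethods<MemoryEffectsOpInterface>` plus the `getEffects` bodies above, the loads now advertise a read and the stores a write. For readers less familiar with the interface, here is a hedged sketch of the consumer side, using the standard `MemoryEffectOpInterface` API rather than anything from this patch:

```cpp
// Sketch: how a generic transform can query the effects declared above.
// Uses the standard MemoryEffectOpInterface API; nothing XeGPU-specific.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

static bool readsMemory(Operation *op) {
  auto memInterface = dyn_cast<MemoryEffectOpInterface>(op);
  if (!memInterface)
    return true; // be conservative when an op declares no effect information
  SmallVector<MemoryEffects::EffectInstance> effects;
  memInterface.getEffects(effects);
  return llvm::any_of(effects, [](const MemoryEffects::EffectInstance &e) {
    return isa<MemoryEffects::Read>(e.getEffect());
  });
}
// xegpu.load_nd / xegpu.load now report a Read effect and xegpu.store_nd /
// xegpu.store a Write effect, so generic passes no longer have to treat them
// as effect-free ops.
```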

>From 263d72d4e1521514b7f93b6ed00f896188ccbb33 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Fri, 31 Jan 2025 23:18:30 +0000
Subject: [PATCH 13/15] add mem side effects interface

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 390 +++++++++---------
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        |   2 +-
 2 files changed, 194 insertions(+), 198 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index d98aa9ffb26f1a5..0ff723005d43597 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -277,102 +277,101 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
 
 
 def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
-    AllElementTypesMatch<["value", "TensorDesc"]>,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  AllElementTypesMatch<["value", "TensorDesc"]>,
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
   ]> {
-    let summary = "loads a n-D block from memory (represented by TensorDesc)"
-                  "to registers (represented by vector)";
-    let description = [{
-      LoadNdOp essentially mimics the hardware block read instruction to read
-      a block of data from memory to register. It takes a set of optional cache
-      hints for each level of cache, L1, L2 and L3. If hardware does not have a
-      correspoding cache, Corresponding cache hint attribute will be masked.
-      VNNI transformation is an hardware feature for Intel GPU, which is used to
-      do data packing during the load for B operand of matrix operation, if
-      the bit width of the data type is less then 32 bits, e.g., fp16. And
-      transpose is another Intel hardware feature, which will do transpose
-      operation when loading the data if the bit width of the data type is
-      fp32 or fp64. It implies that vnni and transpose cannot exit at the
-      same time.
-
-      Example:
-      ```mlir
-        xegpu.load_nd %1 {transpose = [1, 0],
-                          l1_hint = #xegpu.cache_hint<cached>,
-                          l2_hint = #xegpu.cache_hint<uncached>,
-                          l3_hint = #xegpu.cache_hint<streaming>}
-                : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
-      ```
-
-
-    }];
-
-    let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                        OptionalAttr<UnitAttr>: $packed,
-                        OptionalAttr<DenseI64ArrayAttr>: $transpose,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
-    let results = (outs XeGPU_ValueType: $value);
-
-    let extraClassDeclaration = extraBaseClassDeclaration # [{
-      VectorType getType() {
-        return llvm::dyn_cast<VectorType>(getValue().getType());
-      }
+  let summary = "loads a n-D block from memory (represented by TensorDesc)"
+                "to registers (represented by vector)";
+  let description = [{
+    LoadNdOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to register. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If hardware does not have a
+    correspoding cache, Corresponding cache hint attribute will be masked.
+    VNNI transformation is an hardware feature for Intel GPU, which is used to
+    do data packing during the load for B operand of matrix operation, if
+    the bit width of the data type is less then 32 bits, e.g., fp16. And
+    transpose is another Intel hardware feature, which will do transpose
+    operation when loading the data if the bit width of the data type is
+    fp32 or fp64. It implies that vnni and transpose cannot exit at the
+    same time.
 
-      xegpu::TensorDescType getTensorDescType() {
-        return getTensorDesc().getType();
-      }
-    }];
+    Example:
+    ```mlir
+      xegpu.load_nd %1 {transpose = [1, 0],
+                        l1_hint = #xegpu.cache_hint<cached>,
+                        l2_hint = #xegpu.cache_hint<uncached>,
+                        l3_hint = #xegpu.cache_hint<streaming>}
+              : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+    ```
+
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<UnitAttr>: $packed,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
 
-    let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
-    let hasVerifier = 1;
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+  let hasVerifier = 1;
 }
 
 def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
-    AllElementTypesMatch<["value", "TensorDesc"]>,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
   ]> {
-    let summary = "stores a n-D block register region back to memory, currently only supports 2D";
-
-    let description = [{
-      StoreNdOp essentially mimics the hardware block write instruction io
-      write a block of data from register into the memory region as described
-      by the TensorDesc. It takes a set of optional cache hints for each level
-      of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
-      Corresponding cache hint attribute will be masked.
-
-      Example:
-      ```mlir
-        xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-                              l2_hint = #xegpu.cache_hint<write_back>,
-                              l3_hint = #xegpu.cache_hint<write_through>}
-                              : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
-      ```
-
-
-    }];
-
-    let arguments = (ins XeGPU_ValueType: $value,
-                        XeGPU_TensorDesc: $TensorDesc,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
-    let extraClassDeclaration = extraBaseClassDeclaration # [{
-      VectorType getValueType() {
-        return llvm::dyn_cast<VectorType>(getValue().getType());
-      }
+  let summary = "stores a n-D block register region back to memory, currently only supports 2D";
 
-      xegpu::TensorDescType getTensorDescType() {
-        return getTensorDesc().getType();
-      }
-    }];
+  let description = [{
+    StoreNdOp essentially mimics the hardware block write instruction io
+    write a block of data from register into the memory region as described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
+    Corresponding cache hint attribute will be masked.
+
+    Example:
+    ```mlir
+      xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                             l2_hint = #xegpu.cache_hint<write_back>,
+                             l3_hint = #xegpu.cache_hint<write_through>}
+                             : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+    ```
+
+
+  }];
+
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getValueType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
 
-    let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
-                          `:` type($value) `,` qualified(type($TensorDesc))}];
-    let hasVerifier = 1;
+  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+                        `:` type($value) `,` qualified(type($TensorDesc))}];
+  let hasVerifier = 1;
 }
 
 def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
@@ -555,137 +554,134 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
 }
 
 def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
-    AllElementTypesMatch<["value", "TensorDesc"]>,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  AllElementTypesMatch<["value", "TensorDesc"]>,
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
   ]> {
-    let summary = "load a set of scattered data points from memory.";
-
-    let description = [{ It (aka. load) load data per each work-item. The output
-      describes the data being loaded at the subgroup level, so its size is
-      consistent with the number of work-items in a subgroup. When the chunk size
-      is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
-      to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
-      Specially, there is a transpose effect on the result (as compared to the TensorDesc)
-      due to the hardware implementation. Therefore, a transpose attribute is introduced
-      on purpose, making sure users are aware of this implicit transformation.
-
-      The mask operand masks out memory access so that it is safe to pass out-of-boundary
-      addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-
-    Example 1:
-    ```mlir
-      %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
-                              l2_hint = #xegpu.cache_hint<uncached>,
-                              l3_hint = #xegpu.cache_hint<uncached>}
-            : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
-              vector<16xi1> -> vector<16xf32>
-    ```
+  let summary = "load a set of scattered data points from memory.";
+
+  let description = [{ It (aka. load) load data per each work-item. The output
+    describes the data being loaded at the subgroup level, so its size is
+    consistent with the number of work-items in a subgroup. When the chunk size
+    is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
+    to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
+    Specially, there is a transpose effect on the result (as compared to the TensorDesc)
+    due to the hardware implementation. Therefore, a transpose attribute is introduced
+    on purpose, making sure users are aware of this implicit transformation.
+
+    The mask operand masks out memory access so that it is safe to pass out-of-boundary
+    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+  Example 1:
+  ```mlir
+    %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
+                            l2_hint = #xegpu.cache_hint<uncached>,
+                            l3_hint = #xegpu.cache_hint<uncached>}
+          : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+            vector<16xi1> -> vector<16xf32>
+  ```
 
-    Example 2:
-    ```mlir
-      %2 = xegpu.load %1, %0 {transpose,
-                              l1_hint = #xegpu.cache_hint<cached>,
-                              l2_hint = #xegpu.cache_hint<uncached>,
-                              l3_hint = #xegpu.cache_hint<uncached>}
-            : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
-              vector<16xi1> -> vector<8x16xf32>
-    ```
+  Example 2:
+  ```mlir
+    %2 = xegpu.load %1, %0 {transpose,
+                            l1_hint = #xegpu.cache_hint<cached>,
+                            l2_hint = #xegpu.cache_hint<uncached>,
+                            l3_hint = #xegpu.cache_hint<uncached>}
+          : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+            vector<16xi1> -> vector<8x16xf32>
+  ```
 
-    }];
+  }];
 
-    let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                        XeGPU_MaskType: $mask,
-                        OptionalAttr<UnitAttr>: $transpose,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-    let results = (outs XeGPU_ValueType: $value);
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       XeGPU_MaskType: $mask,
+                       OptionalAttr<UnitAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+  let results = (outs XeGPU_ValueType: $value);
 
-    let extraClassDeclaration = extraBaseClassDeclaration # [{
-      xegpu::TensorDescType getTensorDescType() {
-        return getTensorDesc().getType();
-      }
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
 
-      mlir::Type getElementType() {
-        auto type = getValue().getType();
-        return getElementTypeOrSelf(type);
-      }
+    mlir::Type getElementType() {
+      auto type = getValue().getType();
+      return getElementTypeOrSelf(type);
+    }
 
-      Type getValueType() {
-        return getValue().getType();
-      }
+    Type getValueType() {
+      return getValue().getType();
+    }
 
-      Type getMaskType() {
-        return getMask().getType();
-      }
+    Type getMaskType() {
+      return getMask().getType();
+    }
 
-    }];
+  }];
 
-    let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
-        `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+  let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+      `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
 
-    let hasVerifier = 1;
+  let hasVerifier = 1;
 }
 
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", 
-  [
-    AllElementTypesMatch<["value", "TensorDesc"]>,
-    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
-  ]> {
-    let summary = "store data to scattered memory locations.";
-    let description = [{ It (aka. store) stores data to scattered memory locations. The value is
-    typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
-    a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
-    and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
-    has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
-    introduced on purpose, making sure users are aware of this implicit transformation.
-
-    Example 1:
-    ```mlir
-      %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-                                  l2_hint = #xegpu.cache_hint<write_back>,
-                                  l3_hint = #xegpu.cache_hint<write_through>}
-            : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
-    ```
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
+  AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+  let summary = "store data to scattered memory locations.";
+  let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+  typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
+  a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
+  and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
+  has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
+  introduced on purpose, making sure users are aware of this implicit transformation.
+
+  Example 1:
+  ```mlir
+    %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                                 l2_hint = #xegpu.cache_hint<write_back>,
+                                 l3_hint = #xegpu.cache_hint<write_through>}
+          : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+  ```
 
-    Example 2:
-    ```mlir
-      %3 = xegpu.store %0, %1, %2 {transpose,
-                                  l1_hint = #xegpu.cache_hint<uncached>,
-                                  l2_hint = #xegpu.cache_hint<write_back>,
-                                  l3_hint = #xegpu.cache_hint<write_through>}
-            : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
-    ```
+  Example 2:
+  ```mlir
+    %3 = xegpu.store %0, %1, %2 {transpose,
+                                 l1_hint = #xegpu.cache_hint<uncached>,
+                                 l2_hint = #xegpu.cache_hint<write_back>,
+                                 l3_hint = #xegpu.cache_hint<write_through>}
+          : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+  ```
 
-    }];
+  }];
 
-    let arguments = (ins
-      XeGPU_ValueType: $value,
-      XeGPU_TensorDesc: $TensorDesc,
-      XeGPU_MaskType: $mask,
-      OptionalAttr<UnitAttr>: $transpose,
-      OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
-      OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-      OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+  let arguments = (ins
+    XeGPU_ValueType: $value,
+    XeGPU_TensorDesc: $TensorDesc,
+    XeGPU_MaskType: $mask,
+    OptionalAttr<UnitAttr>: $transpose,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-    let extraClassDeclaration = extraBaseClassDeclaration # [{
-      xegpu::TensorDescType getTensorDescType() {
-        return getTensorDesc().getType();
-      }
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
 
-      Type getValueType() {
-        return getValue().getType();
-      }
+    Type getValueType() {
+      return getValue().getType();
+    }
 
-      Type getMaskType() {
-        return getMask().getType();
-      }
-    }];
+    Type getMaskType() {
+      return getMask().getType();
+    }
+  }];
 
-    let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
-              `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+  let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+            `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
 
-    let hasVerifier = 1;
+  let hasVerifier = 1;
 }
 
 def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index d015e5772a94f2a..443a1347334e233 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -506,7 +506,7 @@ LogicalResult PrefetchOp::verify() {
 }
 
 //===----------------------------------------------------------------------===//
-// XeGPU_jrOp
+// XeGPU_LoadGatherOp
 //===----------------------------------------------------------------------===//
 LogicalResult LoadGatherOp::verify() {
   auto tdescTy = getTensorDescType();

>From 1b0bba7dc0b17aa4e27af86a8fcb4c17006157e4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 3 Feb 2025 16:48:54 +0000
Subject: [PATCH 14/15] add mem side effects interface

---
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 11 ++++-----
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 24 -------------------
 2 files changed, 5 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 0ff723005d43597..7560ede058faa3e 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -277,8 +277,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
 
 
 def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
-  AllElementTypesMatch<["value", "TensorDesc"]>,
-  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
   ]> {
   let summary = "loads a n-D block from memory (represented by TensorDesc)"
                 "to registers (represented by vector)";
@@ -331,7 +330,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
 }
 
 def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
-  AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+  AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
   ]> {
   let summary = "stores a n-D block register region back to memory, currently only supports 2D";
 
@@ -554,8 +553,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
 }
 
 def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
-  AllElementTypesMatch<["value", "TensorDesc"]>,
-  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+  AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemRead]>
   ]> {
   let summary = "load a set of scattered data points from memory.";
 
@@ -627,7 +625,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
 }
 
 def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
-  AllElementTypesMatch<["value", "TensorDesc"]>, DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
+  AllElementTypesMatch<["value", "TensorDesc"]>, MemoryEffects<[MemWrite]>
+  ]> {
   let summary = "store data to scattered memory locations.";
   let description = [{ It (aka. store) stores data to scattered memory locations. The value is
   typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 443a1347334e233..cd883baa986b85e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -324,12 +324,6 @@ LogicalResult LoadNdOp::verify() {
   return success();
 }
 
-void LoadNdOp::getEffects(
-    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(MemoryEffects::Read::get());
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreNdOp
 //===----------------------------------------------------------------------===//
@@ -367,12 +361,6 @@ LogicalResult StoreNdOp::verify() {
   return success();
 }
 
-void StoreNdOp::getEffects(
-    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(MemoryEffects::Write::get());
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_UpdateNDOffsetOp
 //===----------------------------------------------------------------------===//
@@ -565,12 +553,6 @@ LogicalResult LoadGatherOp::verify() {
   return success();
 }
 
-void LoadGatherOp::getEffects(
-    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(MemoryEffects::Read::get());
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_StoreScatterOp
 //===----------------------------------------------------------------------===//
@@ -623,12 +605,6 @@ LogicalResult StoreScatterOp::verify() {
   return success();
 }
 
-void StoreScatterOp::getEffects(
-    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
-        &effects) {
-  effects.emplace_back(MemoryEffects::Write::get());
-}
-
 //===----------------------------------------------------------------------===//
 // XeGPU_UpdateOffsetOp
 //===----------------------------------------------------------------------===//
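Replacing the hand-written `getEffects` bodies with the `MemoryEffects<[MemRead]>` / `MemoryEffects<[MemWrite]>` traits keeps the same information while letting ODS generate the boilerplate. A hedged illustration of what that buys generic utilities, assuming the usual helpers declared in `SideEffectInterfaces.h`:

```cpp
// Sketch: with MemRead/MemWrite declared via traits, the generic side-effect
// helpers give sensible answers without any XeGPU-specific code.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

static void classify(Operation *op) {
  // xegpu.store_nd declares a Write effect, so it is never trivially dead and
  // DCE keeps it even though it has no results. xegpu.load_nd declares only a
  // Read effect, so an unused load can still be cleaned up, but it is no
  // longer considered memory-effect free.
  bool effectFree = isMemoryEffectFree(op);       // false for load_nd/store_nd
  bool deadIfUnused = wouldOpBeTriviallyDead(op); // true for load_nd, false for store_nd
  (void)effectFree;
  (void)deadIfUnused;
}
```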

>From 2d664e8a3e114c4b36c1f169334dc52fde5c5b58 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 4 Feb 2025 20:10:26 +0000
Subject: [PATCH 15/15] fix issues

---
 .../XeGPU/Transforms/XeGPUDistribute.cpp        | 17 ++++-------------
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp   |  7 +++----
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 6b27d15de1346bc..0558b125e678e7f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -180,19 +180,10 @@ getDistributedTensorDescType(xegpu::TensorDescType originalT,
     else
       distributedShape.push_back(o);
   }
-  xegpu::TensorDescType distributedDescType;
-  if (originalT.isScattered()) {
-
-    distributedDescType = xegpu::TensorDescType::get(
-        distributedShape, originalT.getElementType(), originalT.getChunkSize(),
-        originalT.getMemorySpace(), originalT.getSGMapAttr());
-  } else {
-    distributedDescType = xegpu::TensorDescType::get(
-        distributedShape, originalT.getElementType(),
-        originalT.getBoundaryCheck(), originalT.getArrayLength(),
-        originalT.getMemorySpace(), originalT.getSGMapAttr());
-  }
-  return distributedDescType;
+
+  return xegpu::TensorDescType::get(
+      originalT.getContext(), distributedShape, originalT.getElementType(),
+      originalT.getEncoding(), originalT.getSGMapAttr());
 }
 } // namespace
 
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index a5060c8eeb916ea..36cdec1fefeb3cc 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -26,8 +26,7 @@ struct TestXeGPUDistribution
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUDistribution)
 
   TestXeGPUDistribution() = default;
-  TestXeGPUDistribution(const TestXeGPUDistribution &pass)
-      : PassWrapper(pass) {}
+  TestXeGPUDistribution(const TestXeGPUDistribution &pass) = default;
 
   StringRef getArgument() const final { return "test-xegpu-distribute"; }
   StringRef getDescription() const final {
@@ -44,7 +43,7 @@ struct TestXeGPUDistribution
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateXeGPUDistributePatterns(patterns);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+    (void)applyPatternsGreedily(getOperation(), std::move(patterns));
   }
 };
 } // namespace
@@ -55,4 +54,4 @@ void registerTestXeGPUTransforms() {
   PassRegistration<TestXeGPUDistribution>();
 }
 } // namespace test
-} // namespace mlir
\ No newline at end of file
+} // namespace mlir


