[Mlir-commits] [mlir] [MLIR][XeGPU] Introduce `xegpu::uArch` usage in target-sensitive passes (PR #163801)

Artem Kroviakov llvmlistbot at llvm.org
Wed Oct 29 07:57:39 PDT 2025


https://github.com/akroviakov updated https://github.com/llvm/llvm-project/pull/163801

From 62be14dc1779771b08a9bd3f4ddc827fba6cf463 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 28 Oct 2025 09:09:40 +0000
Subject: [PATCH 1/3] Update PR to use the new uArch

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  22 +-
 .../mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h   |  30 ---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |   7 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  16 +-
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 230 +++++++++++++-----
 .../Transforms/XeGPUSubgroupDistribute.cpp    |  27 +-
 .../XeGPU/move-gpu-func-to-warp-op.mlir       |   2 +-
 .../XeGPU/propagate-layout-inst-data.mlir     |  89 +++++++
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |  82 +++++--
 9 files changed, 359 insertions(+), 146 deletions(-)
 delete mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
 create mode 100644 mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
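
Note on the overall pattern: this patch replaces the hard-coded
`xegpu::targetinfo` constants with queries against the target `uArch`.
A minimal sketch of that flow, using only calls that appear in the diff
below (`op` and `elemTy` are placeholders, namespaces are abbreviated;
anything not visible in the diff is an assumption):

  // Sketch only, not part of the patch.
  auto uArch = getUArch(xegpu::getChipStr(op).value_or(""));
  const int sgSize = uArch->getSubgroupSize(); // was targetinfo::subgroupSize
  const auto *blockPrefetch =
      dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
          uArch->getInstruction(
              xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
  // Optional block width/height/count supported for the element type; the
  // widest entries (.back()) are used to build inst_data in the propagation
  // pass.
  auto blockWHC = blockPrefetch->getBlockWidthHeightCount(elemTy);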

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 19a52317956d2..40352b44b6441 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -379,28 +379,28 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
   );
 
   let builders = [
-    AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
+    AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data,
+                      "llvm::ArrayRef<int32_t>": $lane_layout,
                      "llvm::ArrayRef<int32_t>": $lane_data),
       [{
         auto sg_layout = DenseI32ArrayAttr();
         auto sg_data = DenseI32ArrayAttr();
-        auto inst_data = DenseI32ArrayAttr();
         auto order = DenseI32ArrayAttr();
-        return $_get($_ctxt, sg_layout, sg_data, inst_data,
+        return $_get($_ctxt, sg_layout, sg_data,
+                     DenseI32ArrayAttr::get($_ctxt, inst_data),
                      DenseI32ArrayAttr::get($_ctxt, lane_layout),
                      DenseI32ArrayAttr::get($_ctxt, lane_data), order);
       }]>,
     AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $lane_layout,
-                     "llvm::ArrayRef<int32_t>": $lane_data,
-                     "llvm::ArrayRef<int32_t>": $order),
+                     "llvm::ArrayRef<int32_t>": $lane_data),
       [{
-        return $_get($_ctxt,
-                     /*sg_layout =*/ nullptr,
-                     /*sg_data   =*/ nullptr,
-                     /*inst_data =*/ nullptr,
+        auto sg_layout = DenseI32ArrayAttr();
+        auto sg_data = DenseI32ArrayAttr();
+        auto inst_data = DenseI32ArrayAttr();
+        auto order = DenseI32ArrayAttr();
+        return $_get($_ctxt, sg_layout, sg_data, inst_data,
                      DenseI32ArrayAttr::get($_ctxt, lane_layout),
-                     DenseI32ArrayAttr::get($_ctxt, lane_data),
-                     DenseI32ArrayAttr::get($_ctxt, order));
+                     DenseI32ArrayAttr::get($_ctxt, lane_data), order);
       }]>,
     AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout,
                      "DenseI32ArrayAttr": $lane_data,
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
deleted file mode 100644
index 8aa9536cb67c1..0000000000000
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-
-namespace mlir {
-namespace xegpu {
-/// HW dependent constants.
-/// TODO: These constants should be queried from the target information.
-namespace targetinfo {
-constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
-/// If DPAS A or B operands have low precision element types they must be packed
-/// according to the following sizes.
-constexpr unsigned packedSizeInBitsForDefault =
-    16; // Minimum packing size per register for DPAS A.
-constexpr unsigned packedSizeInBitsForDpasB =
-    32; // Minimum packing size per register for DPAS B.
-constexpr unsigned packedSizeInBitsForGatherScatter =
-    32; // Minimum packing size per register for Gather and Scatter ops.
-} // namespace targetinfo
-} // namespace xegpu
-} // namespace mlir
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 564d9c4d5422b..b7af5413669c9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
   let options = [Option<
     "printOnly", "print-analysis-only", "bool",
     /*default=*/"false",
-    "Print the result of layout propagation analysis and exit.">];
+    "Print the result of layout propagation analysis and exit.">,
+    Option<
+    "layoutKind", "layout-kind", "std::string",
+    /*default=*/"\"lane\"",
+    "Propagate xegpu layouts at the `sg`, `inst`, or `lane` level.">
+  ];
 }
 
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index f9aa28d5203db..ea9c872aae66f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -11,7 +11,7 @@
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -229,8 +229,10 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
   }
 
   if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) {
-    return emitError()
-           << "expected inst_data and lane_layout to have the same rank";
+    return emitError() << "expected inst_data and lane_layout to have the same "
+                          "rank, got inst_data "
+                       << inst_data.size() << ", lane_layout "
+                       << lane_layout.size();
   }
 
   // sg_data is optional for Workgroup layout, but its presence requires
@@ -568,10 +570,10 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
 
   // for gather and scatter ops, Low-precision types are packed in 32-bit units.
   unsigned bitWidth = elementType.getIntOrFloatBitWidth();
-  int chunkAlignmentFactor =
-      bitWidth < targetinfo::packedSizeInBitsForGatherScatter
-          ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth
-          : 1;
+  constexpr int packingBitSizeGatherScatter{32};
+  int chunkAlignmentFactor = bitWidth < packingBitSizeGatherScatter
+                                 ? packingBitSizeGatherScatter / bitWidth
+                                 : 1;
   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
   if (scatterAttr) {
     int64_t chunkSize = scatterAttr.getChunkSizeAsInt();
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 8fab255d6347f..070535bdf2049 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -14,7 +14,6 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/IR/Attributes.h"
@@ -37,6 +36,8 @@
 #include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+
 namespace mlir {
 namespace xegpu {
 #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
@@ -104,6 +105,8 @@ struct LayoutInfo {
 
   SmallVector<int> getLaneData() const;
 
+  SmallVector<int> getInstData() const;
+
   bool isSliceLayout() const {
     if (!isAssigned())
       return false;
@@ -137,6 +140,13 @@ SmallVector<int> LayoutInfo::getLaneData() const {
                              [](int64_t val) { return static_cast<int>(val); });
 }
 
+SmallVector<int> LayoutInfo::getInstData() const {
+  if (!isAssigned())
+    return {};
+  return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
+                             [](int64_t val) { return static_cast<int>(val); });
+}
+
 void LayoutInfo::print(raw_ostream &os) const {
   if (isAssigned()) {
     os << storage;
@@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
 
   SmallVector<int32_t> laneLayout;
   SmallVector<int32_t> laneData;
+  SmallVector<int32_t> instData;
   for (int64_t idx : permutation) {
     laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
     laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
+    instData.push_back(static_cast<int32_t>(getInstData()[idx]));
   }
-  return LayoutInfo(
-      xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData));
+  return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData,
+                                           laneLayout, laneData));
 }
 
 //===----------------------------------------------------------------------===//
@@ -200,18 +212,32 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
 /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
 /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
 static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
-                                           unsigned rank) {
+                                           unsigned rank,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData) {
   assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
   if (rank == 1) {
     return LayoutInfo(
-        xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1}));
+        xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1}));
   }
   return LayoutInfo(xegpu::LayoutAttr::get(
-      ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1}));
+      ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1}));
+}
+
+static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
+                                           unsigned rank, int subgroupSize) {
+  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
+  if (rank == 1) {
+    return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
+  }
+  return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
 }
 
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData,
+                                           unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
   assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
@@ -221,28 +247,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (vectorTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1);
+    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData);
   // Packing factor is determined by the element type bitwidth.
-  int packingFactor = 1;
   unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
-    packingFactor =
-        bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
-            ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
-            : 1;
-    return LayoutInfo(xegpu::LayoutAttr::get(
-        vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
-        {1, packingFactor}));
+    return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+                                             {uArch->getSubgroupSize(), 1},
+                                             {1, packingFactor}));
   }
-  if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
-    packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth;
-  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
-                                           {1, xegpu::targetinfo::subgroupSize},
+  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+                                           {1, uArch->getSubgroupSize()},
                                            {1, packingFactor}));
 }
 
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
+                                           const xegpu::uArch::uArch *uArch,
+                                           ArrayRef<int> instData,
+                                           unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
   assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) &&
@@ -252,27 +275,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (tdescTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1);
+    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData);
   // Packing factor is determined by the element type bitwidth.
   unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
-
+  int subgroupSize = uArch->getSubgroupSize();
+  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
-    int packingFactor =
-        bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter
-            ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth
-            : 1;
     return LayoutInfo(xegpu::LayoutAttr::get(
-        tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1},
-        {1, packingFactor}));
+        tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor}));
   }
 
-  int packingFactor =
-      (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault)
-          ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth
-          : 1;
-  return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(),
-                                           {1, xegpu::targetinfo::subgroupSize},
-                                           {1, packingFactor}));
+  return LayoutInfo(xegpu::LayoutAttr::get(
+      tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor}));
 }
 
 /// Helper Function to get the expected layouts for DPAS operands. `lane_data`
@@ -281,25 +295,25 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
 /// `packedSizeInBitsForDefault`
 /// * For B operand, the data must be packed in minimum
 /// `packedSizeInBitsForDpasB`
-static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy,
-                                                  unsigned operandNum) {
+static LayoutInfo
+getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
+                                const xegpu::uArch::uArch *uArch,
+                                ArrayRef<int> instData, unsigned packingSize) {
   Type elementTy = vectorTy.getElementType();
   assert(elementTy.isIntOrFloat() &&
          "Expected int or float type in DPAS operands");
-  SmallVector<int32_t, 2> layout({1, xegpu::targetinfo::subgroupSize});
+  SmallVector<int32_t, 2> layout({1, uArch->getSubgroupSize()});
   // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
   // must have the VNNI format.
-  if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() <
-                             xegpu::targetinfo::packedSizeInBitsForDpasB) {
+  if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < packingSize) {
     SmallVector<int32_t, 2> data(
-        {static_cast<int32_t>(xegpu::targetinfo::packedSizeInBitsForDpasB /
-                              elementTy.getIntOrFloatBitWidth()),
+        {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()),
          1});
     return LayoutInfo(
-        xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data));
+        xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data));
   }
   // Otherwise, return the default layout for the vector type.
-  return getDefaultSIMTLayoutInfo(vectorTy);
+  return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize);
 }
 
 //===----------------------------------------------------------------------===//
@@ -456,7 +470,26 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   // Here we assign the default layout to the tensor descriptor operand of
   // prefetch.
   auto tdescTy = prefetch.getTensorDescType();
-  auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy);
+
+  auto uArch = getUArch(getChipStr(prefetch).value_or(""));
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
+          uArch->getInstruction(
+              xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));
+
+  auto blockWHC =
+      uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
+  if (!blockWHC)
+    prefetch.emitWarning("No known block params found for the element type.");
+  auto [bWidth, bHeight, bCount] = blockWHC.value();
+
+  SmallVector<int> instData;
+  if (tdescTy.getRank() == 1)
+    instData = {bWidth.back() * bCount.back()};
+  else
+    instData = {bHeight.back(), bWidth.back() * bCount.back()};
+  auto prefetchLayout = getDefaultSIMTLayoutInfo(
+      tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
   // Propagate the layout to the source tensor descriptor.
   propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
 }
@@ -475,10 +508,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
     reduction.emitWarning("Expecting output type to be 1D vector.");
     return;
   }
+  auto uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
   // Given that the result is 1D, the layout of the operand should be 2D with
   // default layout.
-  LayoutInfo operandLayout =
-      getDefaultSIMTLayoutInfo(reduction->getContext(), 2);
+  LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(
+      reduction->getContext(), 2, uArch->getSubgroupSize());
   propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
   // Accumulator should have the same layout as the result.
   propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
@@ -557,15 +591,36 @@ void LayoutInfoPropagation::visitDpasOp(
     ArrayRef<const LayoutInfoLattice *> results) {
   VectorType aTy = dpas.getLhsType();
   VectorType bTy = dpas.getRhsType();
-  propagateIfChanged(
-      operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0)));
-  propagateIfChanged(
-      operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1)));
+
+  auto uArch = getUArch(getChipStr(dpas).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
+          xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+  const int maxALen =
+      uArchInstruction->getSupportedM(aTy.getElementType()).back();
+  const int maxBLen =
+      uArchInstruction->getSupportedK(bTy.getElementType()).back();
+  SmallVector<int> instDataA = {maxALen, subgroupSize};
+  SmallVector<int> instDataB = {subgroupSize, maxBLen};
+
+  propagateIfChanged(operands[0],
+                     operands[0]->meet(getSIMTLayoutInfoForDPASOperand(
+                         aTy, 0, uArch, instDataA,
+                         uArchInstruction->getPackedFormatBitSizeA())));
+  propagateIfChanged(operands[1],
+                     operands[1]->meet(getSIMTLayoutInfoForDPASOperand(
+                         bTy, 1, uArch, instDataB,
+                         uArchInstruction->getPackedFormatBitSizeB())));
   if (operands.size() > 2) {
     VectorType cTy = dpas.getAccType();
-    propagateIfChanged(
-        operands[2],
-        operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2)));
+    const int maxCLen =
+        uArchInstruction->getSupportedN(bTy.getElementType()).back();
+    SmallVector<int> instDataC = {maxALen, maxCLen};
+    propagateIfChanged(operands[2],
+                       operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
+                           cTy, 2, uArch, instDataC,
+                           uArchInstruction->getPackedFormatBitSizeB())));
   }
 }
 
@@ -573,7 +628,25 @@ void LayoutInfoPropagation::visitDpasOp(
 void LayoutInfoPropagation::visitStoreNdOp(
     xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
     ArrayRef<const LayoutInfoLattice *> results) {
-  LayoutInfo storeLayout = getDefaultSIMTLayoutInfo(store.getValueType());
+
+  auto uArch = getUArch(getChipStr(store).value_or(""));
+  const auto *uArchInstruction =
+      dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
+          uArch->getInstruction(
+              xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+  auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
+      store.getValueType().getElementType());
+  if (!blockWHC)
+    store.emitWarning("No known block params found for the element type.");
+  auto [bWidth, bHeight, bCount] = blockWHC.value();
+  SmallVector<int> instData;
+  if (store.getValueType().getRank() == 1)
+    instData = {bWidth.back() * bCount.back()};
+  else
+    instData = {bHeight.back(), bWidth.back() * bCount.back()};
+  LayoutInfo storeLayout =
+      getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
+                               uArchInstruction->getPackedFormatBitSize());
   // Both operands should have the same layout
   for (LayoutInfoLattice *operand : operands)
     propagateIfChanged(operand, operand->meet(storeLayout));
@@ -694,10 +767,23 @@ void LayoutInfoPropagation::visitLoadGatherOp(
     load.emitWarning("Not propagating, non-vector payload supplied.");
     return;
   }
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true);
+  auto uArch = getUArch(getChipStr(load).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+  SmallVector<int> instData{subgroupSize};
+  if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1)
+    instData.push_back(chunkSize);
+  else if (auto srcTdescTy =
+               dyn_cast<xegpu::TensorDescType>(load.getSourceType())) {
+    if (srcTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(srcTdescTy.getChunkSizeAsInt());
+  }
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(
+      payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+      /*scattered*/ true);
 
   // Mask operand should have 1D default layout.
-  LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1);
+  LayoutInfo maskLayout =
+      getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize);
 
   // Propagate the new layout to the tensor descriptor operand.
   if (isa<xegpu::TensorDescType>(load.getSourceType()))
@@ -717,8 +803,10 @@ void LayoutInfoPropagation::visitCreateDescOp(
   // Need the layout of the descriptor to propagate to the operands.
   if (!descLayout.isAssigned())
     return;
+  auto uArch = getUArch(getChipStr(createDesc).value_or(""));
   // For offset operand propagate 1D default layout.
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1);
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
+                                               uArch->getSubgroupSize());
   propagateIfChanged(operands[1], operands[1]->meet(layout));
 }
 
@@ -735,18 +823,30 @@ void LayoutInfoPropagation::visitStoreScatterOp(
     storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
     return;
   }
+  auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
+  const int subgroupSize = uArch->getSubgroupSize();
+
   auto payloadShape = payloadTy.getShape();
   if (payloadShape.size() > 1)
     assert(
-        payloadShape[0] == xegpu::targetinfo::subgroupSize &&
+        payloadShape[0] == subgroupSize &&
         "Expected the first dimension of 2D tensor descriptor to be equal to "
         "subgroup size.");
 
-  LayoutInfo payloadLayout =
-      getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
+  SmallVector<int> instData{subgroupSize};
+  if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1)
+    instData.push_back(chunkSize);
+  else if (auto dstTdescTy =
+               dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) {
+    if (dstTdescTy.getChunkSizeAsInt() > 1)
+      instData.push_back(dstTdescTy.getChunkSizeAsInt());
+  }
+  LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo(
+      payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
+      /*scattered=*/true);
 
   LayoutInfo maskLayout =
-      getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1);
+      getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize);
   // Propagate the payload operand layout
   propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
   // Propagate the destination (if tdesc) operand layout
@@ -1023,9 +1123,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
     LayoutInfo layout = analysis.getLayoutInfo(val);
     if (!layout.isAssigned())
       return {};
+    xegpu::DistributeLayoutAttr layoutAttr =
+        cast<xegpu::DistributeLayoutAttr>(layout.get());
+    if (this->layoutKind == "lane")
+      layoutAttr = layoutAttr.dropInstData();
     if (layout.isSliceLayout())
-      return cast<xegpu::SliceAttr>(layout.get());
-    return cast<xegpu::LayoutAttr>(layout.get());
+      return cast<xegpu::SliceAttr>(layoutAttr);
+    return cast<xegpu::LayoutAttr>(layoutAttr);
   };
 
   mlir::OpBuilder builder(&getContext());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d09dc196c0bf7..1c5b7e6e86a6c 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -11,10 +11,10 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -159,17 +159,18 @@ static bool requirePacked(const xegpu::LayoutAttr layout) {
 
 /// Helper function to check if the layout requires a transpose effect.
 static bool requireTranspose(const xegpu::LayoutAttr layout,
-                             const std::string &chipStr) {
+                             const xegpu::uArch::uArch *uArch) {
   // Return false for unsupported targets.
   // TODO: Add more support or move to target info.
-  if (chipStr != "pvc" && chipStr != "bmg")
+  if (!uArch->getName().equals_insensitive("pvc") &&
+      !uArch->getName().equals_insensitive("bmg"))
     return false;
   if (!layout)
     return false;
   auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
   if (laneLayout.size() != 2)
     return false;
-  return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1;
+  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
 }
 
 /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
@@ -228,9 +229,14 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
         rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
         /** upperBound = **/ mlir::IntegerAttr());
     ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
+    if (!uArch)
+      return rewriter.notifyMatchFailure(
+          gpuFuncOp, "Subgroup distribution requires target attribute attached "
+                     "to set the warp size");
     auto warpOp = gpu::WarpExecuteOnLane0Op::create(
         rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
-        xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(),
+        uArch->getSubgroupSize(), newGpuFunc.getArguments(),
         newGpuFunc.getArgumentTypes());
     Block &warpBodyBlock = warpOp.getBodyRegion().front();
     // Replace the ReturnOp of the original gpu function with a YieldOp.
@@ -498,11 +504,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
     // Chip information is required to decide if the layout requires transpose
     // effect.
     auto chipStr = xegpu::getChipStr(loadOp);
-    if (!chipStr)
+    auto uArch = getUArch(chipStr.value_or(""));
+    if (!uArch)
       return rewriter.notifyMatchFailure(
-          loadOp,
-          "xegpu::LoadNdOp require chip information to determine transpose "
-          "requirement");
+          loadOp, "xegpu::LoadNdOp requires a target attribute attached to "
+                  "determine the transpose "
+                  "requirement");
     // Expecting offsets to be present.
     SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
     if (offsets.empty())
@@ -556,7 +563,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
     // Set the packed attribute if the layout requires it.
     newLoadOp.setPacked(requirePacked(layout));
     // Set the transpose attribute if the layout requires it.
-    if (requireTranspose(layout, chipStr.value()))
+    if (requireTranspose(layout, uArch))
       newLoadOp.setTranspose(
           DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
     Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
index d289d73e863c7..2780212d2917f 100644
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s
 
 gpu.module @test {
 gpu.func @empty()  {
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
new file mode 100644
index 0000000000000..ad7a0343a2ef1
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -0,0 +1,89 @@
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @dpas_f16(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]]  {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]]  {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+gpu.module @test {
+
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+  %2 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+  %3 = xegpu.load_nd %1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+  %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+  xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+  return
+}
+}
+
+// -----
+gpu.module @test_kernel {
+  gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+    %c0 = arith.constant 0 : index
+    %c32 = arith.constant 32 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %m = arith.muli %block_id_x, %c32 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+    %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+    %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16>
+
+    %out:3 = scf.for %k = %c0 to %c1024 step %c32
+      iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
+      -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
+      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16>
+      %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+      %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
+
+      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16>
+      %c = arith.addf %a, %b : vector<16x32xf16>
+
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>>
+      xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+
+      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
+      scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc
+        : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
+    }
+    gpu.return
+  }
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @scatter_ops_chunksize(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
+  %1 = arith.constant dense<1>: vector<16xi1>
+  %offset = arith.constant dense<12> : vector<16xindex>
+  %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+      : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+  xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+      : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+  return
+}
+}
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 30f785ded975a..543e119d81d88 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s
 
+gpu.module @test {
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
@@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me
   xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @dpas_i8(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) {
 // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16],
@@ -37,8 +40,10 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre
   xegpu.store_nd %0, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_with_transpose_effect(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} :
@@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x
   xegpu.store_nd %4, %5  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_transpose(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> to vector<16x16xf16>
@@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %
   xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @extf_truncf(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, %[[ARG1:[0-9a-zA-Z]+]]:
 // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>) -> vector<8x16xf32> {
@@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
   %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   return %4 : vector<8x16xf32>
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_gather_with_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
   xegpu.store_nd %5, %6  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @load_gather_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
@@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf
   xegpu.store_nd %1, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @store_scatter_with_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> ->
@@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) {
   xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @store_scatter_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}}  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>,
@@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
   xegpu.store %arg0, %0, %cst_0  : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scatter_ops_chunksize(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
       : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scatter_ops(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
@@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) {
   xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
 // CHECK:       %[[LOAD0:.*]] = xegpu.load_nd %{{.*}}  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
 // CHECK-SAME:     !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16>
@@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
   xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
 // CHECK:      %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
 // CHECK-SAME:     !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
@@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8
   xegpu.store_nd %6, %7  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32(
 // CHECK:      %[[LOAD:.*]] = xegpu.load_nd %{{.*}}  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
 // CHECK-SAME:     !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16>
@@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16
   xegpu.store_nd %3, %1  : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle(
 // CHECK:     %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32>
 // CHECK:     %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a
   xegpu.store_nd %3, %1  : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16>
   return
 }
-
+}
 
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @binary_op_one_use(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.
   xegpu.store_nd %4, %arg2  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @binary_op_multiple_uses(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !
   xegpu.store_nd %2, %arg3  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @for_op(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me
   xegpu.store_nd %2#2, %3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @if_single_use(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>,
@@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens
   xegpu.store_nd %2, %arg3  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @if_multiple_uses(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
@@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
   xegpu.store_nd %1, %arg4  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_outer_reduction(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [0] : vector<16x16xf32> to vector<16xf32>
@@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_inner_reduction(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>) {
 // CHECK: %{{.*}} = vector.multi_reduction <add>, %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} [1] : vector<16x16xf32> to vector<16xf32>
@@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @update_nd_offset_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){
   xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @update_nd_offset_2d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){
   xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @prefetch_2d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @prefetch_1d(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
@@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){
   xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @scf_while_and_condition(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) {
 // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>)
@@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32
   }
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed(
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc
   xegpu.store_nd %5, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
-
+}
 // -----
+gpu.module @test {
 // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(
 // CHECK-SAME:     %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
 // CHECK-SAME:     %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
@@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
   xegpu.store_nd %5, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
   return
 }
+}

From a6c9e49658f42e963d254e620b75a1ee1f5e139c Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 28 Oct 2025 09:30:53 +0000
Subject: [PATCH 2/3] Feedback part1

---
 .../mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h      |  2 ++
 .../Transforms/XeGPUSubgroupDistribute.cpp      | 17 ++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)
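
Note: this second patch hoists the uArch lookup to the top of the rewrite
pattern so target-dependent patterns fail fast before doing any work. A
minimal sketch of that guard (illustrative only; `SomeOp` is a placeholder):

  LogicalResult matchAndRewrite(SomeOp op,
                                PatternRewriter &rewriter) const override {
    auto uArch = getUArch(xegpu::getChipStr(op).value_or(""));
    if (!uArch) // no target attribute attached to the enclosing module
      return rewriter.notifyMatchFailure(op, "missing target attribute");
    // ... the rest of the rewrite can rely on uArch->getSubgroupSize() etc.
    return success();
  }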

diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
index dcb2ad5d67a25..b3231a173f33a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -270,6 +270,8 @@ inline const uArch *getUArch(llvm::StringRef archName) {
     return PVCuArch::getInstance();
   else if (archName.equals_insensitive("bmg"))
     return BMGuArch::getInstance();
+  else
+    llvm_unreachable("No matching uArch found");
 
   return nullptr;
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 1c5b7e6e86a6c..5a3b27ec6108e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -200,6 +200,11 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
   using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                 PatternRewriter &rewriter) const override {
+    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
+    if (!uArch)
+      return rewriter.notifyMatchFailure(
+          gpuFuncOp, "Subgroup distribution requires target attribute attached "
+                     "to set the warp size");
     // If the function only contains a single void return, skip.
     if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
           return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
@@ -229,11 +234,6 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
         rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
         /** upperBound = **/ mlir::IntegerAttr());
     ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
-    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
-    if (!uArch)
-      return rewriter.notifyMatchFailure(
-          gpuFuncOp, "Subgroup distribution requires target attribute attached "
-                     "to set the warp size");
     auto warpOp = gpu::WarpExecuteOnLane0Op::create(
         rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
         uArch->getSubgroupSize(), newGpuFunc.getArguments(),
@@ -501,15 +501,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
           warpOp, "warp result is not a xegpu::LoadNd op");
 
     auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
-    // Chip information is required to decide if the layout requires transpose
-    // effect.
-    auto chipStr = xegpu::getChipStr(loadOp);
-    auto uArch = getUArch(chipStr.value_or(""));
+    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
     if (!uArch)
       return rewriter.notifyMatchFailure(
           loadOp, "xegpu::LoadNdOp require target attribute attached to "
                   "determine transpose "
                   "requirement");
+    // Chip information is required to decide if the layout requires transpose
+    // effect.
     // Expecting offsets to be present.
     SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
     if (offsets.empty())
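
For orientation, a minimal sketch of the lookup-and-guard pattern that this patch hoists to the top of matchAndRewrite. The helper name, return type, and include paths below are illustrative assumptions, not part of the change:

    #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"  // xegpu::getChipStr (assumed header)
    #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" // xegpu::uArch::getUArch
    #include "mlir/IR/Operation.h"
    #include <optional>

    // Hypothetical helper, not part of the patch: resolves the uArch for the
    // chip attached to the enclosing gpu.module and returns its subgroup size.
    // With this patch, getUArch() hits llvm_unreachable on unknown arch names
    // in debug builds; the nullptr check still covers release builds and the
    // empty string produced when no target attribute is attached.
    static std::optional<int> lookupSubgroupSize(mlir::Operation *op) {
      const auto *arch =
          mlir::xegpu::uArch::getUArch(mlir::xegpu::getChipStr(op).value_or(""));
      if (!arch)
        return std::nullopt;
      return arch->getSubgroupSize();
    }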

>From c9bd5aa06350243547cbedea87e88751bcb5950d Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Wed, 29 Oct 2025 14:57:19 +0000
Subject: [PATCH 3/3] Find suitable multiple

---
 .../mlir/Dialect/XeGPU/uArch/uArchBase.h      |  2 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  8 +-
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 85 ++++++++++++++++---
 3 files changed, 80 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
index ea33e885c78ff..8f23b89134773 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
@@ -29,6 +29,8 @@ namespace mlir {
 namespace xegpu {
 namespace uArch {
 
+constexpr unsigned generalPackedFormatBitSize{32};
+
 // An enum class to represent the scope of an instruction
 enum class InstructionScope { Lane, Subgroup, Workgroup, Cluster };
 enum class InstructionKind {
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ea9c872aae66f..6b4c185d7d897 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -570,10 +570,10 @@ TensorDescType::verify(llvm::function_ref<InFlightDiagnostic()> emitError,
 
   // for gather and scatter ops, Low-precision types are packed in 32-bit units.
   unsigned bitWidth = elementType.getIntOrFloatBitWidth();
-  constexpr int packingBitSizeGatherScatter{32};
-  int chunkAlignmentFactor = bitWidth < packingBitSizeGatherScatter
-                                 ? packingBitSizeGatherScatter / bitWidth
-                                 : 1;
+  int chunkAlignmentFactor =
+      bitWidth < xegpu::uArch::generalPackedFormatBitSize
+          ? xegpu::uArch::generalPackedFormatBitSize / bitWidth
+          : 1;
   auto scatterAttr = mlir::dyn_cast_if_present<ScatterTensorDescAttr>(encoding);
   if (scatterAttr) {
     int64_t chunkSize = scatterAttr.getChunkSizeAsInt();
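
For reference, a quick numeric check of the packing factor in the hunk above; this standalone sketch mirrors the expression from the diff, and the element widths are just examples:

    // Restatement of the packing rule used above: elements narrower than
    // 32 bits are packed so that one chunk unit spans
    // generalPackedFormatBitSize bits.
    constexpr unsigned generalPackedFormatBitSize = 32; // as in uArchBase.h

    constexpr int chunkAlignmentFactor(unsigned bitWidth) {
      return bitWidth < generalPackedFormatBitSize
                 ? static_cast<int>(generalPackedFormatBitSize / bitWidth)
                 : 1;
    }

    static_assert(chunkAlignmentFactor(16) == 2, "f16/bf16: two elements per 32-bit unit");
    static_assert(chunkAlignmentFactor(8) == 4, "i8: four elements per 32-bit unit");
    static_assert(chunkAlignmentFactor(32) == 1, "f32: no packing needed");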
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 070535bdf2049..90eae871a5ef3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -204,6 +204,28 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
   using Lattice::Lattice;
 };
 
+/// Helper Function to find a proper instruction multiple for the user-supplied
+/// sg-level data shape. `candidates` are uArch allowed shapes.
+/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+                      ArrayRef<T> candidateMultiples = {}) {
+  static_assert(std::is_integral<T>::value, "T must be an integer type");
+  int largest = -1;
+  SmallVector<T> multiples = {1};
+  if (!candidateMultiples.empty())
+    multiples =
+        SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
+  for (T candidate : candidates) {
+    for (T multiple : multiples) {
+      int value = static_cast<int>(candidate * multiple);
+      if (value != 0 && dim % value == 0 && value > largest)
+        largest = value;
+    }
+  }
+  return largest;
+}
+
 /// Helper Functions to get default layouts. A `default layout` is a layout that
 /// is assigned to a value when the layout is not fixed by some anchor operation
 /// (like DPAS).
@@ -482,12 +504,23 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   if (!blockWHC)
     prefetch.emitWarning("No known block params found for the element type.");
   auto [bWidth, bHeight, bCount] = blockWHC.value();
-
   SmallVector<int> instData;
+  int instWidth = getLargestDivisor(
+      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
+      bCount);
+  if (instWidth == -1)
+    prefetch.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
   if (tdescTy.getRank() == 1)
-    instData = {bWidth.back() * bCount.back()};
-  else
-    instData = {bHeight.back(), bWidth.back() * bCount.back()};
+    instData = {instWidth};
+  else {
+    int instHeight = getLargestDivisor(
+        static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
+    if (instHeight == -1)
+      prefetch.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
+    instData = {instHeight, instWidth};
+  }
   auto prefetchLayout = getDefaultSIMTLayoutInfo(
       tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
   // Propagate the layout to the source tensor descriptor.
@@ -597,10 +630,22 @@ void LayoutInfoPropagation::visitDpasOp(
   const auto *uArchInstruction =
       dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
           xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
+
+  const unsigned dataALen = aTy.getShape().front();
+  auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType());
   const int maxALen =
-      uArchInstruction->getSupportedM(aTy.getElementType()).back();
+      getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen));
+  if (maxALen == -1)
+    dpas.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+
+  const unsigned dataBLen = bTy.getShape().back();
+  auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType());
   const int maxBLen =
-      uArchInstruction->getSupportedK(bTy.getElementType()).back();
+      getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen));
+  if (maxBLen == -1)
+    dpas.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
   SmallVector<int> instDataA = {maxALen, subgroupSize};
   SmallVector<int> instDataB = {subgroupSize, maxBLen};
 
@@ -614,8 +659,13 @@ void LayoutInfoPropagation::visitDpasOp(
                          uArchInstruction->getPackedFormatBitSizeB())));
   if (operands.size() > 2) {
     VectorType cTy = dpas.getAccType();
+    const unsigned dataCLen = bTy.getShape().back();
+    auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType());
     const int maxCLen =
-        uArchInstruction->getSupportedN(bTy.getElementType()).back();
+        getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen));
+    if (maxCLen == -1)
+      dpas.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
     SmallVector<int> instDataC = {maxALen, maxCLen};
     propagateIfChanged(operands[2],
                        operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
@@ -634,16 +684,29 @@ void LayoutInfoPropagation::visitStoreNdOp(
       dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
           uArch->getInstruction(
               xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
+  VectorType dataTy = store.getValueType();
   auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
       store.getValueType().getElementType());
   if (!blockWHC)
     store.emitWarning("No known block params found for the element type.");
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
-  if (store.getValueType().getRank() == 1)
-    instData = {bWidth.back() * bCount.back()};
-  else
-    instData = {bHeight.back(), bWidth.back() * bCount.back()};
+  int instWidth = getLargestDivisor(
+      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
+      bCount);
+  if (instWidth == -1)
+    store.emitWarning(
+        "No suitable instruction multiple found for the given shape.");
+  if (dataTy.getRank() == 1)
+    instData = {instWidth};
+  else {
+    int instHeight = getLargestDivisor(
+        static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
+    if (instHeight == -1)
+      store.emitWarning(
+          "No suitable instruction multiple found for the given shape.");
+    instData = {instHeight, instWidth};
+  }
   LayoutInfo storeLayout =
       getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
                                uArchInstruction->getPackedFormatBitSize());
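
To make the selection rule concrete, here is a small self-contained sketch that exercises getLargestDivisor. The helper body is copied from the hunk above (with explicit llvm:: qualification); the candidate widths and multiples are made-up numbers, not real uArch tables:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include <cassert>
    #include <type_traits>

    template <typename T>
    int getLargestDivisor(T dim, llvm::ArrayRef<T> candidates,
                          llvm::ArrayRef<T> candidateMultiples = {}) {
      static_assert(std::is_integral<T>::value, "T must be an integer type");
      int largest = -1;
      llvm::SmallVector<T> multiples = {1};
      if (!candidateMultiples.empty())
        multiples = llvm::SmallVector<T>(candidateMultiples.begin(),
                                         candidateMultiples.end());
      for (T candidate : candidates) {
        for (T multiple : multiples) {
          int value = static_cast<int>(candidate * multiple);
          if (value != 0 && dim % value == 0 && value > largest)
            largest = value;
        }
      }
      return largest;
    }

    int main() {
      int widths[] = {8, 16, 32}; // hypothetical supported block widths
      int counts[] = {1, 2, 4};   // hypothetical block-count multiples
      // 48 is divisible by 8, 16, and 8*2, but by none of the larger
      // width*count products, so the largest admissible value is 16.
      assert(getLargestDivisor(48, llvm::ArrayRef<int>(widths),
                               llvm::ArrayRef<int>(counts)) == 16);
      // A dim that none of the products divide yields -1, which the pass
      // surfaces as the "No suitable instruction multiple found" warning.
      assert(getLargestDivisor(7, llvm::ArrayRef<int>(widths)) == -1);
      return 0;
    }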


