[Mlir-commits] [mlir] d30554b - [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (#135271)
llvmlistbot at llvm.org
Wed Apr 30 12:16:50 PDT 2025
Author: Charitha Saumya
Date: 2025-04-30T12:16:47-07:00
New Revision: d30554b19edc27bc9ca3475b888c1b3e4eda87c4
URL: https://github.com/llvm/llvm-project/commit/d30554b19edc27bc9ca3475b888c1b3e4eda87c4
DIFF: https://github.com/llvm/llvm-project/commit/d30554b19edc27bc9ca3475b888c1b3e4eda87c4.diff
LOG: [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (#135271)
This PR adds the SIMT distribution patterns for create_nd_tdesc, load_nd, store_nd and dpas XeGPU ops.
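As a point of reference for downstream users, the sketch below shows one way the new entry point declared in Transforms.h could be consumed. It is a hypothetical wrapper (the function name and the choice of the greedy driver are assumptions, not part of this commit):

#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Hypothetical helper: collect the XeGPU SIMT distribution patterns added by
// this commit and apply them greedily to the given operation.
static void applyXeGPUSubgroupDistribution(mlir::Operation *op) {
  mlir::RewritePatternSet patterns(op->getContext());
  mlir::xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
  // Failure handling is omitted in this sketch.
  (void)mlir::applyPatternsGreedily(op, std::move(patterns));
}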
Added:
mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
Modified:
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
mlir/lib/Dialect/XeGPU/CMakeLists.txt
mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 3cb71788a15ef..ecab280b76f55 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -189,11 +189,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return scatter_attr.getChunkSize().getInt();
return 1;
}
-
- // This returns a vector type that represents the fragment of data owned by
- // a work item in SIMT mode if this tensor descriptor is used in a XeGPU
- // load/store operation.
- FailureOr<VectorType> getDistributedVectorType();
}];
let hasCustomAssemblyFormat = true;
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df06937..3e94021c7a1ea 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,8 @@ namespace xegpu {
/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+/// Appends patterns for XeGPU SIMT distribution into `patterns`.
+void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
new file mode 100644
index 0000000000000..3616fa614e7f9
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -0,0 +1,57 @@
+//===- XeGPUUtils.h - XeGPU Utilities ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
+#define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
+
+#include "mlir/IR/BuiltinTypes.h"
+namespace mlir {
+
+class VectorType;
+namespace xegpu {
+class LayoutAttr;
+class TensorDescType;
+} // namespace xegpu
+
+namespace xegpu {
+
+/// If the tensor descriptor has a layout attribute, it is used in SIMT mode.
+/// In this mode, the distributed vector shape is determined as follows:
+/// Definitions:
+/// lane_data_size = lane_data[0] × lane_data[1]
+/// subgroup_size = lane_layout[0] × lane_layout[1]
+/// distribution_unit_size = subgroup_size × lane_data_size
+///
+/// Case 1: Regular loads/stores.
+/// The following conditions must be met:
+/// * tensor_desc[0] == lane_layout[0]
+/// Distributed vector is a 1D vector with shape:
+/// [chunk_size]
+///
+/// Case 2: Block loads/stores
+/// Additional definitions:
+/// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
+/// n_distribution_units = tensor_size / distribution_unit_size
+/// fragment_size = n_distribution_units * lane_data_size
+/// Given above definitions, the following conditions must be met:
+/// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
+/// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
+/// Distributed vector is a 1D vector with shape:
+/// [fragment_size]
+FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
+
+/// Helper to get the distributed vector type for a given vector type according
+/// to a given LayoutAttr.
+FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
+ LayoutAttr layout);
+
+} // namespace xegpu
+
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_
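To make the Case 2 arithmetic above concrete, here is a small self-contained worked example; the descriptor shape, layout, and element type are illustrative choices, not taken from the commit:

#include <cassert>
#include <cstdint>

int main() {
  // Assumed block descriptor: 8x16 f16 elements, lane_layout = [1, 16],
  // lane_data = [1, 1], array_length = 1.
  int64_t tdesc[2] = {8, 16};
  int64_t laneLayout[2] = {1, 16};
  int64_t laneData[2] = {1, 1};
  int64_t arrayLength = 1;

  int64_t laneDataSize = laneData[0] * laneData[1];               // 1
  int64_t subgroupSize = laneLayout[0] * laneLayout[1];           // 16
  int64_t distributionUnitSize = subgroupSize * laneDataSize;     // 16
  int64_t tensorSize = tdesc[0] * tdesc[1] * arrayLength;         // 128
  int64_t nDistributionUnits = tensorSize / distributionUnitSize; // 8
  int64_t fragmentSize = nDistributionUnits * laneDataSize;       // 8

  // Each lane would therefore own a vector<8xf16> fragment.
  assert(fragmentSize == 8);
  return 0;
}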
diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
index 9f57627c321fb..31167e6af908b 100644
--- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
@@ -1,2 +1,3 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
+add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index b2d217d192934..6790c5e3af2c0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -376,74 +376,6 @@ LogicalResult TensorDescType::verify(
return success();
}
-// If tensor descriptor has a layout attribute it is used in SIMT mode.
-// In this mode, the distributed vector shape is determined as follows:
-// Definitions:
-// lane_data_size = lane_data[0] × lane_data[1]
-// subgroup_size = lane_layout[0] × lane_layout[1]
-// distribution_unit_size = subgroup_size × lane_data_size
-// ---------------------------------------------------------------------
-// Case 1: Regular loads/stores.
-// ---------------------------------------------------------------------
-// The following conditions must be met:
-// * tensor_desc[0] == lane_layout[0]
-// Distributed vector is a 1D vector with shape:
-// [chunk_size]
-// ---------------------------------------------------------------------
-// Case 2: Block loads/stores
-// ---------------------------------------------------------------------
-// Additional definitions:
-// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
-// n_distribution_units = tensor_size / distribution_unit_size
-// fragment_size = n_distribution_units * lane_data_size
-// Given above definitions, the following conditions must be met:
-// * tensor_desc[0] % (lane_layout[0] × lane_data[0]) == 0
-// * tensor_desc[1] % (lane_layout[1] × lane_data[1]) == 0
-// Distributed vector is a 1D vector with shape:
-// [fragment_size]
-FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
- auto layout = llvm::dyn_cast_if_present<LayoutAttr>(getLayout());
- // It only works for subgroup level layout, which only has lane_layout
- // and lane_data, and is to distribute a SIMD code into SIMT code.
- if (!layout || !layout.isSgLayout())
- return failure();
-
- SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
- SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
- auto tdescShape = getShape();
-
- // compute sgSize by multiply elements of laneLayout
- // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
- // e.g. for 1D layout, sgSize = laneLayout[0]
- auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
- std::multiplies<int64_t>());
-
- // Case 1: regular loads/stores
- auto scatterAttr = getEncodingAsScatterTensorDescAttr();
- if (scatterAttr) {
- auto chunkSize = scatterAttr.getChunkSize().getInt();
- // Verify if the first dimension of the tensor descriptor shape is
- // distributable.
- assert(tdescShape[0] == laneLayout[0] &&
- "tensor descriptor shape is not distributable");
- return VectorType::get({chunkSize}, getElementType());
- }
-
- // Case 2: block loads/stores
- // Check if the tensor descriptor shape is distributable.
- int64_t tensorSize = 1;
- for (auto [tdescDim, laneDim, laneDataDim] :
- llvm::zip_equal(tdescShape, laneLayout, laneData)) {
- assert((tdescDim % (laneDim * laneDataDim) == 0) &&
- "tensor descriptor shape is not distributable");
- tensorSize *= tdescDim;
- }
- // tensorSize must be adjusted for array_length.
- tensorSize *= getArrayLength();
-
- return VectorType::get({tensorSize / sgSize}, getElementType());
-}
-
} // namespace xegpu
} // namespace mlir
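The net effect of the two hunks above is that the distributed-type query moves from a TensorDescType member function to a free function in the new XeGPUUtils library. A hypothetical call-site update (the wrapper name below is illustrative, not from the commit) could look like:

#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"

// Hypothetical call site: query the per-lane fragment type for a tensor
// descriptor.
static mlir::FailureOr<mlir::VectorType>
getLaneFragmentType(mlir::xegpu::TensorDescType tdescTy) {
  // Before this commit: tdescTy.getDistributedVectorType();
  // After this commit:
  return mlir::xegpu::getDistributedVectorType(tdescTy);
}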
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 9f041aae511df..901e02d3c9cf5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -16,4 +16,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
MLIRPass
MLIRTransforms
MLIRGPUDialect
+ MLIRXeGPUUtils
+ MLIRGPUUtils
+ MLIRVectorTransforms
)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 721a815cf76b9..019032f7743bf 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -10,14 +10,33 @@
#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/InliningUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/raw_ostream.h"
@@ -28,27 +47,32 @@ namespace xegpu {
} // namespace xegpu
} // namespace mlir
+#define DEBUG_TYPE "xegpu-subgroup-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
using namespace mlir;
using namespace mlir::dataflow;
/// HW dependent constants.
/// TODO: These constants should be queried from the target information.
-constexpr unsigned subgroupSize = 16; // How many work items in a subgroup.
+constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
/// If DPAS A or B operands have low precision element types they must be packed
/// according to the following sizes.
constexpr unsigned packedSizeInBitsForDefault =
16; // Minimum packing size per register for DPAS A.
constexpr unsigned packedSizeInBitsForDpasB =
32; // Minimum packing size per register for DPAS B.
+static const char *const operandLayoutNamePrefix = "layout_operand_";
+static const char *const resultLayoutNamePrefix = "layout_result_";
namespace {
-///===----------------------------------------------------------------------===///
-/// Layout
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// Layout
+//===----------------------------------------------------------------------===//
-/// Helper class to store the ND layout of work items within a subgroup and data
-/// owned by each work item.
+/// Helper class to store the ND layout of lanes within a subgroup and data
+/// owned by each lane.
struct Layout {
SmallVector<int64_t, 3> layout;
Layout() = default;
@@ -67,235 +91,248 @@ int64_t Layout::operator[](size_t idx) const {
return layout[idx];
}
-/// WiLayout represents the layout of work items within a subgroup when it
-/// accesses some value. WiData represents the layout of data owned by each work
-/// item.
-using WiLayout = Layout;
-using WiData = Layout;
+/// LaneLayout represents the logical layout of lanes within a subgroup when it
+/// accesses some value. LaneData represents the logical layout of data owned by
+/// each work item.
+using LaneLayout = Layout;
+using LaneData = Layout;
-///===----------------------------------------------------------------------===///
-/// SGMap
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfo
+//===----------------------------------------------------------------------===//
-/// Helper class for tracking the analysis state of a value. For SGPropagation,
-/// the analysis state is simply the wi_layout and wi_data of each value.
-/// Purpose of this analysis to propagate some unique layout for each value in
-/// the program starting from some known values (like DPAS, StoreNd, etc.).
+/// Helper class for tracking the analysis state of an mlir value. For layout
+/// propagation, the analysis state is simply the lane_layout and lane_data of
+/// each value. Purpose of this analysis to propagate some unique layout for
+/// each value in the program starting from a set of anchor operations (like
+/// DPAS, StoreNd, etc.).
///
-/// Given this, SGMap satisifies the following properties:
-/// 1) SGMap is a lattice with two states - assigned and not assigned.
-/// 2) Two SGMap values are equal if they are both assigned or both not
-/// assigned. The concrete value of assigned state does not matter.
+/// Given this, LayoutInfo satisfies the following properties:
+/// 1) A LayoutInfo value can be in one of two states - `assigned` or `not
+/// assigned`.
+/// 2) Two LayoutInfo values are equal if they are both assigned or
+/// both not assigned. The concrete value of assigned state does not matter.
/// 3) The meet operator works as follows:
/// - If current state is assigned, return the current state. (already
/// a unique layout is assigned. don't change it)
/// - Otherwise, return the other state.
-struct SGMap {
+struct LayoutInfo {
private:
- WiLayout wiLayout;
- WiData wiData;
+ LaneLayout laneLayout;
+ LaneData laneData;
public:
- SGMap() = default;
- SGMap(const WiLayout &layout, const WiData &data)
- : wiLayout(layout), wiData(data) {}
+ LayoutInfo() = default;
+ LayoutInfo(const LaneLayout &layout, const LaneData &data)
+ : laneLayout(layout), laneData(data) {}
- /// Two lattice values are equal if they have `some` layout. The actual
- /// content of the layout does not matter.
- bool operator==(const SGMap &other) const {
+ // Two lattice values are equal if they have `some` layout. The actual
+ // content of the layout does not matter.
+ bool operator==(const LayoutInfo &other) const {
return this->isAssigned() == other.isAssigned();
}
- static SGMap meet(const SGMap &lhs, const SGMap &rhs);
+ static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);
- static SGMap join(const SGMap &lhs, const SGMap &rhs);
+ static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);
void print(raw_ostream &os) const;
- bool isAssigned() const { return wiLayout.size() > 0 && wiData.size() > 0; }
+ bool isAssigned() const {
+ return laneLayout.size() > 0 && laneData.size() > 0;
+ }
- SGMap getTransposedLayout(ArrayRef<int64_t> permutation) const;
+ LayoutInfo getTransposedLayout(ArrayRef<int64_t> permutation) const;
- const WiLayout &getLayout() const { return wiLayout; }
- const WiData &getData() const { return wiData; }
+ const LaneLayout &getLayout() const { return laneLayout; }
+ const LaneData &getData() const { return laneData; }
+ ArrayRef<int64_t> getLayoutAsArrayRef() const { return laneLayout.layout; }
+ ArrayRef<int64_t> getDataAsArrayRef() const { return laneData.layout; }
};
-void SGMap::print(raw_ostream &os) const {
+void LayoutInfo::print(raw_ostream &os) const {
if (isAssigned()) {
- os << "wi_layout: ";
- wiLayout.print(os);
- os << ", wi_data: ";
- wiData.print(os);
+ os << "lane_layout: ";
+ laneLayout.print(os);
+ os << ", lane_data: ";
+ laneData.print(os);
} else
os << "Not assigned.";
}
-SGMap SGMap::meet(const SGMap &lhs, const SGMap &rhs) {
+LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
if (!lhs.isAssigned())
return rhs;
return lhs;
}
/// Since this is a backward analysis, join method is not used.
-SGMap SGMap::join(const SGMap &lhs, const SGMap &rhs) {
- llvm_unreachable("Join should not be triggered by SGMapPropagation.");
+LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
+ llvm_unreachable("Join should not be triggered by layout propagation.");
}
/// Get the transposed layout according to the given permutation.
-SGMap SGMap::getTransposedLayout(ArrayRef<int64_t> permutation) const {
+LayoutInfo
+LayoutInfo::getTransposedLayout(ArrayRef<int64_t> permutation) const {
if (!isAssigned())
return {};
- WiLayout newLayout;
- WiData newData;
- for (auto idx : permutation) {
- newLayout.layout.push_back(wiLayout.layout[idx]);
- newData.layout.push_back(wiData.layout[idx]);
+ LaneLayout newLayout;
+ LaneData newData;
+ for (int64_t idx : permutation) {
+ newLayout.layout.push_back(laneLayout.layout[idx]);
+ newData.layout.push_back(laneData.layout[idx]);
}
- return SGMap(newLayout, newData);
+ return LayoutInfo(newLayout, newData);
}
-///===----------------------------------------------------------------------===///
-/// SGMapLattice
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfoLattice
+//===----------------------------------------------------------------------===//
-/// Lattice holding the SGMap for each value.
-struct SGMapLattice : public Lattice<SGMap> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SGMapLattice)
+/// Lattice holding the LayoutInfo for each value.
+struct LayoutInfoLattice : public Lattice<LayoutInfo> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
using Lattice::Lattice;
};
/// Helper Functions to get default layouts. A `default layout` is a layout that
/// is assigned to a value when the layout is not fixed by some anchor operation
-/// (like DPAS). This is the natural layout work items are arranged in a
-/// subgroup.
+/// (like DPAS).
/// Helper Function to get the default layout for uniform values like constants.
-/// For 1D vector, wi_layout is [subgroupSize] and wi_data is [1].
-/// For 2D vector, wi_layout is [1, subgroupSize] and wi_data is [1, 1].
-static SGMap getDefaultSgMap(unsigned rank) {
+/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
+/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
+static LayoutInfo getDefaultLayoutInfo(unsigned rank) {
assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
if (rank == 1)
- return SGMap(WiLayout({subgroupSize}), WiData({1}));
- return SGMap(WiLayout({1, subgroupSize}), WiData({1, 1}));
+ return LayoutInfo(LaneLayout({subgroupSize}), LaneData({1}));
+ return LayoutInfo(LaneLayout({1, subgroupSize}), LaneData({1, 1}));
}
/// Helper to get the default layout for a vector type.
-static SGMap getDefaultSgMap(VectorType vectorTy) {
- /// Expecting a 1D or 2D vector.
+static LayoutInfo getDefaultLayoutInfo(VectorType vectorTy) {
+ // Expecting a 1D or 2D vector.
assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) &&
"Expected 1D or 2D vector.");
- /// Expecting int or float element type.
+ // Expecting int or float element type.
assert(vectorTy.getElementType().isIntOrFloat() &&
"Expected int or float element type.");
- /// If the rank is 1, then return default layout for 1D vector.
+ // If the rank is 1, then return default layout for 1D vector.
if (vectorTy.getRank() == 1)
- return getDefaultSgMap(1);
- /// Packing factor is determined by the element type bitwidth.
+ return getDefaultLayoutInfo(1);
+ // Packing factor is determined by the element type bitwidth.
int packingFactor = 1;
- auto bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
+ unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
if (bitwidth < packedSizeInBitsForDefault)
packingFactor = packedSizeInBitsForDefault / bitwidth;
- return SGMap(WiLayout({1, subgroupSize}), WiData({1, packingFactor}));
+ return LayoutInfo(LaneLayout({1, subgroupSize}),
+ LaneData({1, packingFactor}));
}
-/// Helper Function to get the expected layouts for DPAS operands. `wi_data` is
-/// set according to the following criteria:
+/// Helper Function to get the expected layouts for DPAS operands. `lane_data`
+/// is set according to the following criteria:
/// * For A operand, the data must be packed in minimum
/// `packedSizeInBitsForDefault`
/// * For B operand, the data must be packed in minimum
/// `packedSizeInBitsForDpasB`
-static SGMap getSGMapForDPASOperand(VectorType vectorTy, unsigned operandNum) {
- auto elementTy = vectorTy.getElementType();
+static LayoutInfo getLayoutInfoForDPASOperand(VectorType vectorTy,
+ unsigned operandNum) {
+ Type elementTy = vectorTy.getElementType();
assert(elementTy.isIntOrFloat() &&
"Expected int or float type in DPAS operands");
- WiLayout layout({1, subgroupSize});
- /// For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
- /// must have the VNNI format.
+ LaneLayout layout({1, subgroupSize});
+ // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and
+ // must have the VNNI format.
if (operandNum == 1 &&
elementTy.getIntOrFloatBitWidth() < packedSizeInBitsForDpasB) {
- WiData data(
+ LaneData data(
{packedSizeInBitsForDpasB / elementTy.getIntOrFloatBitWidth(), 1});
- return SGMap(layout, data);
+ return LayoutInfo(layout, data);
}
- /// Otherwise, return the default layout for the vector type.
- return getDefaultSgMap(vectorTy);
+ // Otherwise, return the default layout for the vector type.
+ return getDefaultLayoutInfo(vectorTy);
}
-///===----------------------------------------------------------------------===///
-/// SGMapPropagation
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// LayoutInfoPropagation
+//===----------------------------------------------------------------------===//
-/// Backward data flow analysis to propagate the wi_layout and wi_data of each
-/// value in the program. Currently, the layouts for operands DPAS, StoreNd, and
-/// StoreScatter are fixed (known before propagation). Purpose of this analysis
-/// is to propagate those known layouts to all their producers and (other)
-/// consumers.
-class SGMapPropagation : public SparseBackwardDataFlowAnalysis<SGMapLattice> {
+/// Backward data flow analysis to propagate the lane_layout and lane_data of
+/// each value in the program. Currently, the layouts for operands DPAS,
+/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
+/// this analysis is to propagate those known layouts to all their producers and
+/// (other) consumers.
+class LayoutInfoPropagation
+ : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
private:
- void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
- void visitStoreNdOp(xegpu::StoreNdOp store, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitStoreNdOp(xegpu::StoreNdOp store,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
- void visitLoadNdOp(xegpu::LoadNdOp load, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ void visitLoadNdOp(xegpu::LoadNdOp load,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitLoadGatherOp(xegpu::LoadGatherOp load,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitTransposeOp(vector::TransposeOp transpose,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitVectorBitcastOp(vector::BitCastOp bitcast,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitCreateDescOp(xegpu::CreateDescOp createDesc,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results);
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results);
public:
- SGMapPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable)
+ LayoutInfoPropagation(DataFlowSolver &solver,
+ SymbolTableCollection &symbolTable)
: SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
- LogicalResult visitOperation(Operation *op, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) override;
+ LogicalResult
+ visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) override;
void visitBranchOperand(OpOperand &operand) override {};
void visitCallOperand(OpOperand &operand) override {};
void visitExternalCall(CallOpInterface call,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) override {};
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) override {
+ };
- void setToExitState(SGMapLattice *lattice) override {
- (void)lattice->meet(SGMap());
+ void setToExitState(LayoutInfoLattice *lattice) override {
+ (void)lattice->meet(LayoutInfo());
}
};
} // namespace
-LogicalResult
-SGMapPropagation::visitOperation(Operation *op,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
+LogicalResult LayoutInfoPropagation::visitOperation(
+ Operation *op, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
TypeSwitch<Operation *>(op)
.Case<xegpu::DpasOp>(
[&](auto dpasOp) { visitDpasOp(dpasOp, operands, results); })
@@ -315,8 +352,8 @@ SGMapPropagation::visitOperation(Operation *op,
.Case<xegpu::UpdateNdOffsetOp>([&](auto updateNdOffsetOp) {
visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
})
- /// No need to propagate the layout to operands in CreateNdDescOp because
- /// they are scalars (offsets, sizes, etc.).
+ // No need to propagate the layout to operands in CreateNdDescOp because
+ // they are scalars (offsets, sizes, etc.).
.Case<xegpu::CreateNdDescOp>([&](auto createNdDescOp) {})
.Case<vector::TransposeOp>([&](auto transposeOp) {
visitTransposeOp(transposeOp, operands, results);
@@ -327,245 +364,251 @@ SGMapPropagation::visitOperation(Operation *op,
.Case<vector::MultiDimReductionOp>([&](auto reductionOp) {
visitVectorMultiReductionOp(reductionOp, operands, results);
})
- /// All other ops.
+ // All other ops.
.Default([&](Operation *op) {
- for (const SGMapLattice *r : results) {
- for (SGMapLattice *operand : operands) {
- /// Propagate the layout of the result to the operand.
+ for (const LayoutInfoLattice *r : results) {
+ for (LayoutInfoLattice *operand : operands) {
+ // Propagate the layout of the result to the operand.
if (r->getValue().isAssigned())
meet(operand, *r);
}
}
});
- /// Add a dependency from each result to program point after the operation.
- for (const SGMapLattice *r : results) {
- addDependency(const_cast<SGMapLattice *>(r), getProgramPointAfter(op));
+ // Add a dependency from each result to program point after the operation.
+ for (const LayoutInfoLattice *r : results) {
+ addDependency(const_cast<LayoutInfoLattice *>(r), getProgramPointAfter(op));
}
return success();
}
-void SGMapPropagation::visitVectorMultiReductionOp(
- vector::MultiDimReductionOp reduction, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- /// The layout of the result must be present.
- auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitVectorMultiReductionOp(
+ vector::MultiDimReductionOp reduction,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // The layout of the result must be present.
+ LayoutInfo resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
return;
- /// We only consider 2D -> 1D reductions at this point.
+ // We only consider 2D -> 1D reductions at this point.
assert(resultLayout.getLayout().size() == 1 &&
"Expected 1D layout for reduction result.");
- /// Given that the result is 1D, the layout of the operand should be 2D with
- /// default layout.
- auto operandLayout = getDefaultSgMap(2);
+ // Given that the result is 1D, the layout of the operand should be 2D with
+ // default layout.
+ LayoutInfo operandLayout = getDefaultLayoutInfo(2);
propagateIfChanged(operands[0], operands[0]->meet(operandLayout));
- /// Accumulator should have the same layout as the result.
+ // Accumulator should have the same layout as the result.
propagateIfChanged(operands[1], operands[1]->meet(resultLayout));
}
/// Propagate the layout of the result tensor to the source tensor descriptor in
/// UpdateNdOffsetOp.
-void SGMapPropagation::visitUpdateNdOffsetOp(
- xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- /// The layout of the result must be present.
- auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitUpdateNdOffsetOp(
+ xegpu::UpdateNdOffsetOp updateNdOffset,
+ ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // The layout of the result must be present.
+ LayoutInfo resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
return;
- /// Propagate the layout to the source operand.
+ // Propagate the layout to the source operand.
propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
}
/// Set the layouts for DPAS A, B, and C operands.
-void SGMapPropagation::visitDpasOp(xegpu::DpasOp dpas,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto aTy = dpas.getLhsType();
- auto bTy = dpas.getRhsType();
+void LayoutInfoPropagation::visitDpasOp(
+ xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ VectorType aTy = dpas.getLhsType();
+ VectorType bTy = dpas.getRhsType();
propagateIfChanged(operands[0],
- operands[0]->meet(getSGMapForDPASOperand(aTy, 0)));
+ operands[0]->meet(getLayoutInfoForDPASOperand(aTy, 0)));
propagateIfChanged(operands[1],
- operands[1]->meet(getSGMapForDPASOperand(bTy, 1)));
+ operands[1]->meet(getLayoutInfoForDPASOperand(bTy, 1)));
if (operands.size() > 2) {
- auto cTy = dpas.getAccType();
+ VectorType cTy = dpas.getAccType();
propagateIfChanged(operands[2],
- operands[2]->meet(getSGMapForDPASOperand(cTy, 2)));
+ operands[2]->meet(getLayoutInfoForDPASOperand(cTy, 2)));
}
}
/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
-void SGMapPropagation::visitStoreNdOp(xegpu::StoreNdOp store,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto storeLayout = getDefaultSgMap(store.getValueType());
- /// Both operands should have the same layout
- for (SGMapLattice *operand : operands) {
+void LayoutInfoPropagation::visitStoreNdOp(
+ xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ LayoutInfo storeLayout = getDefaultLayoutInfo(store.getValueType());
+ // Both operands should have the same layout
+ for (LayoutInfoLattice *operand : operands) {
propagateIfChanged(operand, operand->meet(storeLayout));
}
}
/// Propagate the layout of the value to the tensor descriptor operand in
/// LoadNdOp.
-void SGMapPropagation::visitLoadNdOp(xegpu::LoadNdOp load,
- ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto valueLayout = results[0]->getValue();
- /// Need the layout of the value to propagate to the tensor descriptor.
+void LayoutInfoPropagation::visitLoadNdOp(
+ xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ LayoutInfo valueLayout = results[0]->getValue();
+ // Need the layout of the value to propagate to the tensor descriptor.
if (!valueLayout.isAssigned())
return;
- SGMap tensorDescLayout = valueLayout;
- /// LoadNdOp has the transpose effect. However, at the stage of this analysis
- /// this effect is not expected and should be abstracted away. Emit a warning.
+ LayoutInfo tensorDescLayout = valueLayout;
+ // LoadNdOp has the transpose effect. However, at the stage of this analysis
+ // this effect is not expected and should be abstracted away. Emit a warning.
if (auto transpose = load.getTranspose()) {
load.emitWarning("Transpose effect is not expected for LoadNdOp at "
- "SGMapPropagation stage.");
+ "LayoutInfoPropagation stage.");
tensorDescLayout = valueLayout.getTransposedLayout(transpose.value());
}
- /// Propagate the new layout to the tensor descriptor operand.
+ // Propagate the new layout to the tensor descriptor operand.
propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
}
/// For vector::TransposeOp, the layout of the result is transposed and
/// propagated to the operand.
-void SGMapPropagation::visitTransposeOp(
- vector::TransposeOp transpose, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- /// Need the layout of transpose result to propagate to the operands.
- auto resultLayout = results[0]->getValue();
+void LayoutInfoPropagation::visitTransposeOp(
+ vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // Need the layout of transpose result to propagate to the operands.
+ LayoutInfo resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
return;
- auto newLayout = resultLayout.getTransposedLayout(transpose.getPermutation());
- /// Propagate the new layout to the vector operand.
+ LayoutInfo newLayout =
+ resultLayout.getTransposedLayout(transpose.getPermutation());
+ // Propagate the new layout to the vector operand.
propagateIfChanged(operands[0], operands[0]->meet(newLayout));
}
-/// For vector::BitCastOp, the wi_data of the source layout is changed based on
-/// the bit width of the source and result types.
-void SGMapPropagation::visitVectorBitcastOp(
- vector::BitCastOp bitcast, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- /// Need the layout of bitcast result to propagate to the operands.
- auto resultLayout = results[0]->getValue();
+/// For vector::BitCastOp, the lane_data of the source layout is changed based
+/// on the bit width of the source and result types.
+void LayoutInfoPropagation::visitVectorBitcastOp(
+ vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // Need the layout of bitcast result to propagate to the operands.
+ LayoutInfo resultLayout = results[0]->getValue();
if (!resultLayout.isAssigned())
return;
- auto inElemTyBitWidth =
+ int inElemTyBitWidth =
bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth();
- auto outElemTyBitWidth =
+ int outElemTyBitWidth =
bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth();
- /// WiLayout does not change.
- const WiLayout &newWiLayout = resultLayout.getLayout();
- const WiData &currData = resultLayout.getData();
- WiData newWiData;
- /// It's a widening bitcast
+ // LaneLayout does not change.
+ const LaneLayout &newLaneLayout = resultLayout.getLayout();
+ const LaneData &currData = resultLayout.getData();
+ LaneData newLaneData;
+ // It's a widening bitcast
if (inElemTyBitWidth < outElemTyBitWidth) {
- auto ratio = outElemTyBitWidth / inElemTyBitWidth;
- newWiData = resultLayout.getData()[0] == 1
- ? WiData({1, currData[1] * ratio})
- : WiData({currData[0] * ratio, 1});
+ int ratio = outElemTyBitWidth / inElemTyBitWidth;
+ newLaneData = resultLayout.getData()[0] == 1
+ ? LaneData({1, currData[1] * ratio})
+ : LaneData({currData[0] * ratio, 1});
} else {
- /// It's a narrowing bitcast
- auto ratio = inElemTyBitWidth / outElemTyBitWidth;
- newWiData = resultLayout.getData()[0] == 1
- ? WiData({1, currData[1] / ratio})
- : WiData({currData[0] / ratio, 1});
+ // It's a narrowing bitcast
+ int ratio = inElemTyBitWidth / outElemTyBitWidth;
+ newLaneData = resultLayout.getData()[0] == 1
+ ? LaneData({1, currData[1] / ratio})
+ : LaneData({currData[0] / ratio, 1});
}
propagateIfChanged(operands[0],
- operands[0]->meet(SGMap(newWiLayout, newWiData)));
+ operands[0]->meet(LayoutInfo(newLaneLayout, newLaneData)));
}
/// Propagate the layout of the result to the tensor descriptor and mask
/// operands in LoadGatherOp.
-void SGMapPropagation::visitLoadGatherOp(
- xegpu::LoadGatherOp load, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto valueLayout = results[0]->getValue();
- /// Need the layout of the value to propagate to the tensor descriptor.
+void LayoutInfoPropagation::visitLoadGatherOp(
+ xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ LayoutInfo valueLayout = results[0]->getValue();
+ // Need the layout of the value to propagate to the tensor descriptor.
if (!valueLayout.isAssigned())
return;
- SGMap tensorDescLayout = valueLayout;
+ LayoutInfo tensorDescLayout = valueLayout;
if (load.getTranspose()) {
- /// LoadGatherOp has the transpose effect. However, at the stage of this
- /// analyis this effect is not expected and should be abstracted away. Emit
- /// a warning.
+ // LoadGatherOp has the transpose effect. However, at the stage of this
+ // analysis this effect is not expected and should be abstracted away. Emit
+ // a warning.
load.emitWarning("Transpose effect is not expected for LoadGatherOp at "
- "SGMapPropagation stage.");
+ "LayoutInfoPropagation stage.");
tensorDescLayout = valueLayout.getTransposedLayout({1, 0});
}
- /// Mask operand should have 1D default layout.
- auto maskLayout = getDefaultSgMap(1);
- /// Propagate the new layout to the tensor descriptor operand.
+ // Mask operand should have 1D default layout.
+ LayoutInfo maskLayout = getDefaultLayoutInfo(1);
+ // Propagate the new layout to the tensor descriptor operand.
propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout));
- /// Propagate the new layout to the mask operand.
+ // Propagate the new layout to the mask operand.
propagateIfChanged(operands[1], operands[1]->meet(maskLayout));
}
/// Propagate the layout of the descriptor to the vector offset operand in
/// CreateDescOp.
-void SGMapPropagation::visitCreateDescOp(
- xegpu::CreateDescOp createDesc, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- auto descLayout = results[0]->getValue();
- /// Need the layout of the descriptor to propagate to the operands.
+void LayoutInfoPropagation::visitCreateDescOp(
+ xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ LayoutInfo descLayout = results[0]->getValue();
+ // Need the layout of the descriptor to propagate to the operands.
if (!descLayout.isAssigned())
return;
- /// For offset operand propagate 1D default layout.
- SGMap layout = getDefaultSgMap(1);
+ // For offset operand propagate 1D default layout.
+ LayoutInfo layout = getDefaultLayoutInfo(1);
propagateIfChanged(operands[1], operands[1]->meet(layout));
}
/// Set the layout for the value, tensor descriptor, and mask operands in the
/// StoreScatterOp.
-void SGMapPropagation::visitStoreScatterOp(
- xegpu::StoreScatterOp storeScatter, ArrayRef<SGMapLattice *> operands,
- ArrayRef<const SGMapLattice *> results) {
- /// Currently, for 2D StoreScatterOp we expect that the height dimension of
- /// the tensor descriptor is evenly divisible by the subgroup size.
- /// TODO: Add support for other 2D shapes.
- auto tdescShape = storeScatter.getTensorDescType().getShape();
- if (tdescShape.size() > 1 && tdescShape[0] % subgroupSize != 0) {
- storeScatter.emitError("Height dimension of the tensor descriptor should "
- "be evenly divisible by the subgroup size.");
- return;
- }
- auto valueLayout = getDefaultSgMap(storeScatter.getValueType());
- SGMap storeScatterLayout = valueLayout;
+void LayoutInfoPropagation::visitStoreScatterOp(
+ xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
+ ArrayRef<const LayoutInfoLattice *> results) {
+ // Currently, for 2D StoreScatterOp we expect that the height dimension of
+ // the tensor descriptor is equal to the subgroup size. This is ensured by
+ // the op verifier.
+ ArrayRef<int64_t> tdescShape = storeScatter.getTensorDescType().getShape();
+ if (tdescShape.size() > 1)
+ assert(
+ tdescShape[0] == subgroupSize &&
+ "Expected the first dimension of 2D tensor descriptor to be equal to "
+ "subgroup size.");
+
+ LayoutInfo valueLayout = getDefaultLayoutInfo(storeScatter.getValueType());
+ LayoutInfo storeScatterLayout = valueLayout;
if (storeScatter.getTranspose()) {
- /// StoreScatteOp allows transpose effect. However, at the stage of this
- /// analyis this effect is not expected and should be abstracted away. Emit
- /// a warning.
+ // StoreScatterOp allows transpose effect. However, at the stage of this
+ // analysis this effect is not expected and should be abstracted away. Emit
+ // a warning.
storeScatter.emitWarning("Transpose effect is not expected for "
- "StoreScatterOp at SGMapPropagation stage.");
+ "StoreScatterOp at LayoutInfoPropagation stage.");
storeScatterLayout = valueLayout.getTransposedLayout({1, 0});
}
- /// Propagate the value layout.
+ // Propagate the value layout.
propagateIfChanged(operands[0], operands[0]->meet(valueLayout));
- /// Propagate the tensor descriptor layout.
+ // Propagate the tensor descriptor layout.
propagateIfChanged(operands[1], operands[1]->meet(storeScatterLayout));
- /// Use default 1D layout for mask operand.
- auto maskLayout = getDefaultSgMap(1);
+ // Use default 1D layout for mask operand.
+ LayoutInfo maskLayout = getDefaultLayoutInfo(1);
propagateIfChanged(operands[2], operands[2]->meet(maskLayout));
}
namespace {
-///===----------------------------------------------------------------------===///
-/// RunSGMapPropagation
-///===----------------------------------------------------------------------===///
+//===----------------------------------------------------------------------===//
+// RunLayoutInfoPropagation
+//===----------------------------------------------------------------------===//
-/// Driver class for running the SGMapPropagation analysis.
-class RunSGMapPropagation {
+/// Driver class for running the LayoutInfoPropagation analysis.
+class RunLayoutInfoPropagation {
public:
- RunSGMapPropagation(Operation *op) : target(op) {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
+
+ RunLayoutInfoPropagation(Operation *op) : target(op) {
SymbolTableCollection symbolTable;
solver.load<DeadCodeAnalysis>();
solver.load<SparseConstantPropagation>();
- solver.load<SGMapPropagation>(symbolTable);
+ solver.load<LayoutInfoPropagation>(symbolTable);
(void)solver.initializeAndRun(op);
}
- SGMap getSGMap(Value val);
+ LayoutInfo getLayoutInfo(Value val);
void printAnalysisResult(llvm::raw_ostream &os);
@@ -575,21 +618,21 @@ class RunSGMapPropagation {
};
} // namespace
-SGMap RunSGMapPropagation::getSGMap(Value val) {
- auto *state = solver.lookupState<SGMapLattice>(val);
+LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
+ auto *state = solver.lookupState<LayoutInfoLattice>(val);
if (!state)
return {};
return state->getValue();
}
-void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
+void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
auto printFunctionResult = [&](FunctionOpInterface funcOp) {
os << "function: " << funcOp.getName() << ":\n";
// Function arguments
- for (auto arg : funcOp.getArguments()) {
- auto layout = getSGMap(arg);
+ for (BlockArgument arg : funcOp.getArguments()) {
+ LayoutInfo layout = getLayoutInfo(arg);
os << "argument: " << arg << "\n";
- os << "sg_map : ";
+ os << "layout : ";
layout.print(os);
os << "\n";
}
@@ -599,16 +642,16 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
if (op->getResults().empty())
return;
os << "op : ";
- /// For control-flow ops, print the op name only.
+ // For control-flow ops, print the op name only.
if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
os << op->getName();
else
op->print(os);
os << "\n";
- /// Print the sg_map for each result.
+ // Print the layout for each result.
for (auto [i, r] : llvm::enumerate(op->getResults())) {
- auto layout = getSGMap(r);
- os << "sg_map for result #" << i << ": ";
+ LayoutInfo layout = getLayoutInfo(r);
+ os << "layout for result #" << i << ": ";
layout.print(os);
os << "\n";
}
@@ -620,19 +663,757 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
for (auto funcOp : modOp.getOps<FunctionOpInterface>()) {
funcOps.push_back(funcOp);
}
- /// Collect all GpuFuncOps in the module.
+ // Collect all GpuFuncOps in the module.
for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>()) {
funcOps.push_back(gpuFuncOp);
}
}
}
- /// Print the analysis result for each function.
- for (auto funcOp : funcOps) {
+ // Print the analysis result for each function.
+ for (FunctionOpInterface funcOp : funcOps) {
printFunctionResult(funcOp);
}
}
+namespace {
+
+//===----------------------------------------------------------------------===//
+// LayoutAttrAssignment
+//===----------------------------------------------------------------------===//
+
+/// This class is responsible for assigning the layout attributes to the ops and
+/// their users based on the layout propagation analysis result.
+class LayoutAttrAssignment {
+public:
+ LayoutAttrAssignment(Operation *top,
+ function_ref<LayoutInfo(Value)> getLayout)
+ : getAnalysisResult(getLayout), top(top) {}
+
+ LogicalResult run();
+
+private:
+ LogicalResult assign(Operation *op);
+ void assignToUsers(Value v, xegpu::LayoutAttr layout);
+ xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+ LogicalResult resolveConflicts();
+ // Callable to get the layout of a value based on the layout propagation
+ // analysis.
+ function_ref<LayoutInfo(Value)> getAnalysisResult;
+ Operation *top;
+};
+
+} // namespace
+
+/// Helper to assign the layout attribute to the users of the value.
+void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+ for (OpOperand &user : v.getUses()) {
+ Operation *owner = user.getOwner();
+ unsigned operandNumber = user.getOperandNumber();
+ // Use a generic name for ease of querying the layout attribute later.
+ std::string attrName =
+ operandLayoutNamePrefix + std::to_string(operandNumber);
+ owner->setAttr(attrName, layout);
+ }
+}
+
+/// Convert the layout assigned to a value to xegpu::LayoutAttr.
+xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+ LayoutInfo layout = getAnalysisResult(v);
+ if (!layout.isAssigned())
+ return {};
+ SmallVector<int, 2> laneLayout, laneData;
+ for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+ layout.getDataAsArrayRef())) {
+ laneLayout.push_back(static_cast<int>(layout));
+ laneData.push_back(static_cast<int>(data));
+ }
+ return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+}
+
+/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+/// based on the layout propagation analysis result.
+LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+ // For function ops, propagate the function argument layout to the users.
+ if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+ for (BlockArgument arg : func.getArguments()) {
+ xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg);
+ if (layoutInfo) {
+ assignToUsers(arg, layoutInfo);
+ }
+ }
+ return success();
+ }
+ // If no results, move on.
+ if (op->getNumResults() == 0)
+ return success();
+ // If all the results are scalars, move on.
+ if (llvm::all_of(op->getResultTypes(),
+ [](Type t) { return t.isIntOrIndexOrFloat(); }))
+ return success();
+ // If the op has more than one result and at least one result is a tensor
+ // descriptor, exit. This case is not supported yet.
+ // TODO: Support this case.
+ if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) {
+ return isa<xegpu::TensorDescType>(t);
+ })) {
+ LLVM_DEBUG(
+ DBGS() << op->getName()
+ << " op has more than one result and at least one is a tensor "
+ "descriptor. This case is not handled.\n");
+ return failure();
+ }
+ // If the result is a tensor descriptor, attach the layout to the tensor
+ // descriptor itself.
+ if (auto tensorDescTy =
+ dyn_cast<xegpu::TensorDescType>(op->getResultTypes()[0])) {
+ xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0));
+ if (!layoutInfo) {
+ LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+ return failure();
+ }
+
+ // Clone the op, attach the layout to the result tensor descriptor, and
+ // remove the original op.
+ OpBuilder builder(op);
+ Operation *newOp = builder.clone(*op);
+ auto newTensorDescTy = xegpu::TensorDescType::get(
+ tensorDescTy.getContext(), tensorDescTy.getShape(),
+ tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
+ newOp->getResult(0).setType(newTensorDescTy);
+ op->replaceAllUsesWith(newOp->getResults());
+ op->erase();
+ return success();
+ }
+ // Otherwise simply attach the layout to the op itself.
+ for (auto [i, r] : llvm::enumerate(op->getResults())) {
+ xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
+ if (layoutInfo) {
+ std::string attrName = resultLayoutNamePrefix + std::to_string(i);
+ op->setAttr(attrName, layoutInfo);
+ // Attach the layout attribute to the users of the result.
+ assignToUsers(r, layoutInfo);
+ }
+ }
+ return success();
+}
+
+/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+LogicalResult LayoutAttrAssignment::run() {
+ auto walkResult = top->walk([&](Operation *op) {
+ if (failed(assign(op)))
+ return WalkResult::interrupt();
+ return WalkResult::advance();
+ });
+
+ if (walkResult.wasInterrupted())
+ return failure();
+
+ return resolveConflicts();
+}
+
+/// TODO: Implement the layout conflict resolution. This must ensure mainly two
+/// things:
+/// 1) Is a given layout supported by the op? (need to query the target
+/// HW info). Otherwise can we achieve this layout using a layout conversion?
+/// 2) Do all the operands have the required layout? If not, can it
+/// be resolved using a layout conversion?
+LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// SIMT Distribution Patterns
+//===----------------------------------------------------------------------===//
+
+/// Helper function to get distributed vector type for a source vector type
+/// according to the lane_layout. We simply divide each dimension of tensor
+/// descriptor shape by corresponding lane_layout dimension. If
+/// array_length > 1, that is appended to the front of the distributed shape.
+/// NOTE: This is the vector type that will be returned by the
+/// gpu.warp_execute_on_lane0 op.
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16 | [1, 16] | 32x1 |
+/// | 32x16 | [2, 8] | 16x2 |
+/// | 2x32x16 | [1, 16] | 2x32x1 |
+static FailureOr<VectorType>
+getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+ VectorType originalType) {
+ if (!layout)
+ return failure();
+
+ auto laneLayout = layout.getLaneLayout().asArrayRef();
+ assert(originalType.getShape().size() >= laneLayout.size() &&
+ "Rank of the original vector type should be greater or equal to the "
+ "size of the lane layout to distribute the vector type.");
+ SmallVector<int64_t> distributedShape(originalType.getShape());
+ // Only distribute the last `laneLayout.size()` dimensions. The remaining
+ // dimensions are not distributed.
+ unsigned distributionStart = originalType.getRank() - laneLayout.size();
+ for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+ if (i < distributionStart) {
+ continue;
+ }
+ // Check if the dimension can be distributed evenly.
+ if (dim % laneLayout[i - distributionStart] != 0)
+ return failure();
+ distributedShape[i] = dim / laneLayout[i - distributionStart];
+ }
+ return VectorType::get(distributedShape, originalType.getElementType());
+}
+
+// Drop the layout attribute from the tensor descriptor type if layout is
+// present.
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+ if (tensorDesc.getLayoutAttr() == xegpu::LayoutAttr())
+ return tensorDesc;
+
+ return xegpu::TensorDescType::get(
+ tensorDesc.getContext(), tensorDesc.getShape(),
+ tensorDesc.getElementType(), tensorDesc.getEncoding(),
+ xegpu::LayoutAttr());
+}
+
+/// Helper function to resolve types if the distributed type out of
+/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
+/// Example 1:
+/// distributed type: vector<8x1xf32>
+/// expected type: vector<8xf32>
+/// resolved using,
+/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
+/// Example 2:
+/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
+/// expected type: xegpu.tensor_desc<8x16xf32>
+/// resolved using,
+/// %0 = unrealized_conversion_cast %1 :
+/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
+/// xegpu.tensor_desc<8x16xf32>
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+ PatternRewriter &rewriter) {
+ // If orig and expected types are the same, return orig.
+ if (orig.getType() == expected)
+ return orig;
+ // If orig is a vector type, create a shape cast op to reconcile the types.
+ if (auto origVecType = isa<VectorType>(orig.getType())) {
+ auto castOp =
+ rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+ return castOp.getResult();
+ }
+ // If orig is a tensor descriptor type, create an unrealized conversion cast
+ // op to reconcile the types.
+ if (auto origTensorDescTy = isa<xegpu::TensorDescType>(orig.getType())) {
+ auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+ expected, orig);
+ return castOp.getResult(0);
+ }
+ llvm_unreachable("Unsupported type for reconciliation");
+ return orig;
+}
+
+/// Helper function to filter out the temporary layout attributes attached
+/// during the layout assignment process. These are not needed after going to
+/// SIMT.
+static SmallVector<NamedAttribute>
+removeTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+ SmallVector<NamedAttribute> newAttrs;
+ for (NamedAttribute attr : attrs) {
+ if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+ attr.getName().strref().contains(resultLayoutNamePrefix)) {
+ continue;
+ }
+ newAttrs.push_back(attr);
+ }
+ return newAttrs;
+}
+
+/// Helper function to check if the layout is packed. Layout is packed if it is
+/// 2D and lane_data[0] != 1 (data packed from col dimension).
+static bool hasPackedLayout(xegpu::LayoutAttr layout) {
+ if (layout == xegpu::LayoutAttr())
+ return false;
+ DenseI32ArrayAttr laneData = layout.getLaneData();
+ if (!laneData || laneData.size() != 2)
+ return false;
+ return laneData.asArrayRef()[0] != 1;
+}
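
As a concrete illustration (a standalone sketch under the 2-D case only, not the MLIR API): with lane_data = [2, 1], as used for the B operand of dpas in the examples that follow, the first entry differs from 1, so the layout is packed and the rewritten load_nd gets the packed attribute.

```
#include <array>
#include <cassert>
#include <cstdint>

// Mirrors the 2-D case of hasPackedLayout: a lane_data whose first entry is
// not 1 means data is packed along the column dimension.
static bool isPacked(const std::array<int64_t, 2> &laneData) {
  return laneData[0] != 1;
}

int main() {
  assert(isPacked({2, 1}));  // B operand layout of the f16 dpas example
  assert(!isPacked({1, 1})); // A operand / output layout
}
```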
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
+/// contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.return %result: vector<8x16xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+/// %laneid = gpu.lane_id : index
+/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+/// ...
+/// ...
+/// gpu.yield %result: vector<8x16xf32>
+/// }
+/// gpu.return %0
+/// }
+/// ```
+struct MoveFuncBodyToWarpExecuteOnLane0
+ : public OpRewritePattern<gpu::GPUFuncOp> {
+ using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+ PatternRewriter &rewriter) const override {
+ // If the function only contains a single void return, skip.
+ if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+ }))
+ return failure();
+ // If the function body was already moved inside a warp_execute_on_lane0, skip.
+ if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+ return isa<gpu::WarpExecuteOnLane0Op>(op);
+ }))
+ return failure();
+ // Create a new function with the same signature.
+ auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+ gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+ // Create a WarpExecuteOnLane0Op with same arguments and results as the
+ // original gpuFuncOp.
+ rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+ auto laneId = rewriter.create<gpu::LaneIdOp>(
+ newGpuFunc.getLoc(), rewriter.getIndexType(),
+ /** upperBound = **/ mlir::IntegerAttr());
+ ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+ auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+ laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+ newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+ Block &warpBodyBlock = warpOp.getBodyRegion().front();
+ // Replace the ReturnOp of the original gpu function with a YieldOp.
+ auto origReturnOp =
+ cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+ rewriter.setInsertionPointAfter(origReturnOp);
+ rewriter.create<gpu::YieldOp>(origReturnOp.getLoc(),
+ origReturnOp.getOperands());
+ rewriter.eraseOp(origReturnOp);
+ // Move the original function body to the WarpExecuteOnLane0Op body.
+ rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+ warpOp.getBodyRegion().begin());
+ rewriter.eraseBlock(&warpBodyBlock);
+ // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+ rewriter.setInsertionPointAfter(warpOp);
+ rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+ rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+ return success();
+ }
+};
+
+/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
+/// arguments. Tensor descriptor shape is not distributed because it is a
+/// uniform value across all work items within the subgroup. However, the
+/// layout information is dropped in the new tensor descriptor type.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// vector.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #lo0>
+/// vector.yield %arg0, %dead
+/// }
+/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+
+ xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+ if (!layout)
+ return rewriter.notifyMatchFailure(
+ descOp, "the tensor descriptor lacks layout attribute");
+
+ SmallVector<size_t> newRetIndices;
+ SmallVector<Value> newYieldValues;
+ SmallVector<Type> newYieldTypes;
+
+ for (Value operand : descOp->getOperands()) {
+ newYieldValues.push_back(operand);
+ newYieldTypes.push_back(operand.getType());
+ }
+ rewriter.setInsertionPoint(subgroupOp);
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+ /* new yielded types = */ newYieldTypes, newRetIndices);
+
+ SmallVector<Value> newDescOperands;
+ for (size_t i : newRetIndices) {
+ newDescOperands.push_back(newWarpOp.getResult(i));
+ }
+ rewriter.setInsertionPointAfter(newWarpOp);
+ xegpu::TensorDescType distributedTensorDescTy =
+ dropLayouts(descOp.getType()); // Distributed tensor descriptor type
+ // does not contain layout info.
+ auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+ newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
+ descOp->getAttrs());
+
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+ return success();
+ }
+};
+
+/// Distribute a store_nd op at the end of the enclosing
+/// `gpu.warp_execute_on_lane_0` region. Store operands that are defined inside
+/// the warp op are forwarded to the rewritten warp op as yielded values. The
+/// source vector is distributed based on the lane layout. Appropriate cast ops
+/// are inserted if the distributed types do not match the expected xegpu SIMT
+/// types.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
+/// #lo0>
+/// }
+/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
+/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0>
+/// -> !xegpu.tensor_desc<4x8xf32>
+/// xegpu.store_nd %0, %1: vector<4xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ auto yield = cast<gpu::YieldOp>(
+ subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+ if (!storeOp)
+ return failure();
+
+ xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
+ xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ if (!layout)
+ return rewriter.notifyMatchFailure(
+ storeOp, "the source tensor descriptor lacks layout attribute");
+
+ FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+ if (failed(distributedTypeByWarpOpOrFailure))
+ return rewriter.notifyMatchFailure(storeOp,
+ "Failed to distribute the type");
+ VectorType distributedTypeByWarpOp =
+ distributedTypeByWarpOpOrFailure.value();
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp,
+ /* new yielded values = */
+ ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+ /* new yielded types = */
+ TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+ newRetIndices);
+ // Create a new store op outside the warp op with the distributed vector
+ // type. Tensor descriptor is not distributed.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newStoreOperands;
+
+ // For the value operand, there can be a mismatch between the vector type
+ // distributed by the warp op and the (xegpu-specific) distributed type
+ // supported by the store op. The type mismatch must be resolved using an
+ // appropriate cast op.
+ FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
+ xegpu::getDistributedVectorType(storeOp.getTensorDescType());
+ if (failed(storeNdDistributedValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ storeOp, "Failed to get distributed vector type for the store op");
+ newStoreOperands.push_back(resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[0]),
+ storeNdDistributedValueTyOrFailure.value(), rewriter));
+ // For the tensor descriptor operand, the layout attribute is dropped after
+ // distribution. Types need to be resolved in this case as well.
+ xegpu::TensorDescType distributedTensorDescTy =
+ dropLayouts(storeOp.getTensorDescType());
+ newStoreOperands.push_back(
+ resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+ distributedTensorDescTy, rewriter));
+
+ rewriter.create<xegpu::StoreNdOp>(
+ newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+ removeTemporaryLayoutAttributes(storeOp->getAttrs()));
+ rewriter.eraseOp(storeOp);
+ return success();
+ }
+};
+
+/// Distribute a load_nd op feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the lane layout; the tensor descriptor type is not
+/// distributed. Appropriate cast ops are inserted if the distributed types do
+/// not match the expected xegpu SIMT types.
+///
+/// Example:
+///
+/// ```
+/// #lo0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (vector<4x1xf32>) {
+/// ...
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+/// vector<4x8xf32>
+/// gpu.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #lo0>) {
+/// ...
+/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #lo0> ->
+/// vector<4x8xf32>
+/// gpu.yield %dead, %arg0
+/// }
+/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
+/// #lo0> -> !xegpu.tensor_desc<4x8xf32>
+/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
+/// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
+///
+/// ```
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+ auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+ xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+ xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+ if (!layout)
+ return rewriter.notifyMatchFailure(
+ loadOp, "the source tensor descriptor lacks layout attribute");
+
+ unsigned operandIdx = operand->getOperandNumber();
+ VectorType distributedTypeByWarpOp =
+ cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp,
+ /* new yielded values = */ loadOp.getTensorDesc(),
+ /* new yielded types = */ tensorDescTy, newRetIndices);
+
+ // Create a new load op outside the warp op with the distributed vector
+ // type.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ FailureOr<VectorType> loadNdDistValueTyOrFailure =
+ xegpu::getDistributedVectorType(loadOp.getTensorDescType());
+ if (failed(loadNdDistValueTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ loadOp, "Failed to get distributed vector type for the load op");
+ xegpu::TensorDescType distributedTensorDescTy =
+ dropLayouts(loadOp.getTensorDescType()); // Distributed tensor
+ // descriptor type does not
+ // contain layout info.
+ auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+ newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+ resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+ distributedTensorDescTy, rewriter),
+ removeTemporaryLayoutAttributes(loadOp->getAttrs()));
+ // Set the packed attribute if the layout requires it.
+ newLoadOp.setPacked(hasPackedLayout(layout));
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ // There can be a conflict between the vector type distributed by the
+ // warp op and (xegpu-specific) distributed type supported by the load
+ // op. Resolve these mismatches by inserting a cast.
+ Value tyResolvedVal = resolveDistributedTy(
+ newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
+ rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
+ return success();
+ }
+};
+
+/// Distribute a dpas op feeding into vector.yield op for the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later). The yield op will
+/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
+/// distributed types do not match the expected xegpu SIMT types.
+/// Example:
+/// ```
+/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
+/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (vector<8x1xf32>) {
+/// ...
+/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
+/// vector<8x16xf32>
+/// gpu.yield %dpas
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
+/// vector<8x1xf16>, vector<16x1xf16>) {
+/// ...
+/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
+/// -> vector<8x16xf32>
+/// gpu.yield %dead, %arg0, %arg1
+/// }
+/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
+/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
+/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
+/// vector<8xf32>
+/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
+/// ```
+struct DpasDistribution final : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(subgroupOp, llvm::IsaPred<xegpu::DpasOp>);
+ if (!operand)
+ return rewriter.notifyMatchFailure(subgroupOp,
+ "warp result is not a xegpu::Dpas op");
+
+ auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
+ unsigned operandIdx = operand->getOperandNumber();
+ std::string layoutAName =
+ llvm::formatv("{0}{1}", operandLayoutNamePrefix, 0).str();
+ std::string layoutBName =
+ llvm::formatv("{0}{1}", operandLayoutNamePrefix, 1).str();
+ auto layoutCName = llvm::formatv("{0}{1}", resultLayoutNamePrefix, 0).str();
+ xegpu::LayoutAttr layoutA =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutAName);
+ xegpu::LayoutAttr layoutB =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutBName);
+ xegpu::LayoutAttr layoutOut =
+ dpasOp->getAttrOfType<xegpu::LayoutAttr>(layoutCName);
+ if (!layoutA || !layoutB || !layoutOut)
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "the xegpu::Dpas op lacks layout attribute for A, B or output");
+
+ FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
+ FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
+ FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
+ if (failed(distLhsTypeByWarpOpOrFailure) ||
+ failed(distRhsTypeByWarpOpOrFailure) ||
+ failed(distResultTypeByWarpOpOrFailure))
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "Failed to distribute the A, B or output types in xegpu::Dpas op");
+
+ llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
+ dpasOp.getRhs()};
+ llvm::SmallVector<Type, 3> newYieldTypes{
+ distLhsTypeByWarpOpOrFailure.value(),
+ distRhsTypeByWarpOpOrFailure.value()};
+ // Dpas acc operand is optional.
+ if (dpasOp.getAcc()) {
+ newYieldValues.push_back(dpasOp.getAcc());
+ newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
+ }
+ // Create a new warp op without the dpas.
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices);
+
+ FailureOr<VectorType> expectedDistLhsTyOrFailure =
+ xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
+ FailureOr<VectorType> expectedDistRhsTyOrFailure =
+ xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
+ FailureOr<VectorType> expectedDistResultTyOrFailure =
+ xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
+ if (failed(expectedDistLhsTyOrFailure) ||
+ failed(expectedDistRhsTyOrFailure) ||
+ failed(expectedDistResultTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ dpasOp,
+ "Failed to get distributed vector type for the dpas operands.");
+ // Create a new dpas op outside the warp op.
+ rewriter.setInsertionPointAfter(newWarpOp);
+ SmallVector<Value> newDpasOperands;
+ SmallVector<VectorType> newDpasOperandExpectedTypes;
+
+ // Resolve the distributed types with the original types.
+ newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
+ newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
+ VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
+ if (dpasOp.getAcc())
+ newDpasOperandExpectedTypes.push_back(distributedResultTy);
+
+ for (unsigned i = 0; i < newRetIndices.size(); i++) {
+ newDpasOperands.push_back(
+ resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
+ newDpasOperandExpectedTypes[i], rewriter));
+ }
+ Value newDpasOp = rewriter.create<xegpu::DpasOp>(
+ newWarpOp->getLoc(), distributedResultTy, newDpasOperands,
+ removeTemporaryLayoutAttributes(dpasOp->getAttrs()));
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ // Resolve the output type.
+ newDpasOp = resolveDistributedTy(
+ newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter);
+ rewriter.replaceAllUsesWith(distributedVal, newDpasOp);
+ return success();
+ }
+};
+
+} // namespace
+
namespace {
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
@@ -646,14 +1427,61 @@ struct XeGPUSubgroupDistributePass final
};
} // namespace
-void XeGPUSubgroupDistributePass::runOnOperation() {
- Operation *op = getOperation();
- RunSGMapPropagation solver(op);
+void xegpu::populateXeGPUSubgroupDistributePatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+ LoadNdDistribution, DpasDistribution>(patterns.getContext());
+}
- // Print the analysis result and exit.
+void XeGPUSubgroupDistributePass::runOnOperation() {
+ auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+ // Print the analysis result and exit (for testing purposes).
if (printOnly) {
auto &os = llvm::outs();
- solver.printAnalysisResult(os);
+ analysis.printAnalysisResult(os);
+ return;
+ }
+ auto getPropagatedLayout = [&](Value val) {
+ return analysis.getLayoutInfo(val);
+ };
+
+ // Assign xegpu::LayoutAttr to all ops and their users based on the layout
+ // propagation analysis result.
+ LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
+ if (failed(layoutAssignment.run())) {
+ signalPassFailure();
+ return;
+ }
+
+ // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
+ // operation.
+ {
+ RewritePatternSet patterns(&getContext());
+ patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+
+ if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+ signalPassFailure();
+ return;
+ }
+ }
+ // Finally, do the SIMD to SIMT distribution.
+ RewritePatternSet patterns(&getContext());
+ xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+ // TODO: distributionFn and shuffleFn are not used at this point.
+ auto distributionFn = [](Value val) {
+ VectorType vecType = dyn_cast<VectorType>(val.getType());
+ int64_t vecRank = vecType ? vecType.getRank() : 0;
+ OpBuilder builder(val.getContext());
+ if (vecRank == 0)
+ return AffineMap::get(val.getContext());
+ return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+ };
+ auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+ int64_t warpSz) { return Value(); };
+ vector::populatePropagateWarpVectorDistributionPatterns(
+ patterns, distributionFn, shuffleFn);
+ if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+ signalPassFailure();
return;
}
}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
new file mode 100644
index 0000000000000..afd8e2d5c4df3
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_mlir_dialect_library(MLIRXeGPUUtils
+ XeGPUUtils.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU/Utils
+
+ LINK_LIBS PUBLIC
+ MLIRIR
+ MLIRXeGPUDialect
+ )
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
new file mode 100644
index 0000000000000..6b45ed0ae4ced
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -0,0 +1,85 @@
+//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utility methods for working with the XeGPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include <cstdint>
+#include <numeric>
+
+using namespace mlir;
+
+FailureOr<VectorType>
+mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
+ auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
+ // It only works for subgroup level layout, which only has lane_layout
+ // and lane_data, and is used to distribute SIMD code into SIMT code.
+ if (!layout || !layout.isSgLayout())
+ return failure();
+
+ SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
+ SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
+ auto tdescShape = tdescTy.getShape();
+ auto elementType = tdescTy.getElementType();
+
+ // Compute sgSize by multiplying the elements of laneLayout.
+ // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
+ // e.g. for 1D layout, sgSize = laneLayout[0]
+ auto sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(), 1,
+ std::multiplies<int64_t>());
+
+ // Case 1: regular loads/stores
+ auto scatterAttr = tdescTy.getEncodingAsScatterTensorDescAttr();
+ if (scatterAttr) {
+ auto chunkSize = scatterAttr.getChunkSize().getInt();
+ // Verify that the first dimension of the tensor descriptor shape is
+ // distributable.
+ assert(tdescShape[0] == laneLayout[0] &&
+ "tensor descriptor shape is not distributable");
+ return VectorType::get({chunkSize}, elementType);
+ }
+
+ // Case 2: block loads/stores
+ // Check if the tensor descriptor shape is distributable.
+ int64_t tensorSize = 1;
+ for (auto [tdescDim, laneDim, laneDataDim] :
+ llvm::zip_equal(tdescShape, laneLayout, laneData)) {
+ assert((tdescDim % (laneDim * laneDataDim) == 0) &&
+ "tensor descriptor shape is not distributable");
+ tensorSize *= tdescDim;
+ }
+ // tensorSize must be adjusted for array_length.
+ tensorSize *= tdescTy.getArrayLength();
+
+ return VectorType::get({tensorSize / sgSize}, elementType);
+}
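
The size computation for the block case can be spot-checked against the array_length test added below: a 16x16xf16 block descriptor with array_length = 2 and lane_layout [1, 16] holds 512 elements in total, so each of the 16 lanes owns a vector<32xf16>. A standalone sketch of that arithmetic (illustrative only, no MLIR types):

```
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Mirrors the block case of getDistributedVectorType: total tensor elements
// (including array_length copies) divided by the subgroup size.
static int64_t distributedVectorSize(const std::vector<int64_t> &tdescShape,
                                     const std::vector<int64_t> &laneLayout,
                                     int64_t arrayLength) {
  int64_t tensorSize = std::accumulate(tdescShape.begin(), tdescShape.end(),
                                       int64_t(1), std::multiplies<int64_t>());
  int64_t sgSize = std::accumulate(laneLayout.begin(), laneLayout.end(),
                                   int64_t(1), std::multiplies<int64_t>());
  return tensorSize * arrayLength / sgSize;
}

int main() {
  // 16x16xf16, array_length = 2, lane_layout [1, 16] -> 32 elements per lane,
  // i.e. the vector<32xf16> seen in the load_nd_array_length test.
  std::cout << distributedVectorSize({16, 16}, {1, 16}, 2) << '\n'; // 32
}
```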
+
+FailureOr<VectorType>
+mlir::xegpu::getDistributedVectorType(VectorType originalType,
+ xegpu::LayoutAttr layout) {
+ int64_t rank = originalType.getRank();
+ // Distributed vector type is only supported for 1D, 2D and 3D vectors.
+ if (rank < 1 || rank > 3)
+ return failure();
+ ArrayRef<int64_t> shape = originalType.getShape();
+ // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
+ // of the 3D vector.
+ int arrayLength = 1;
+ if (rank == 3) {
+ arrayLength = shape[0];
+ shape = shape.drop_front();
+ }
+ auto helperTdescTy = xegpu::TensorDescType::get(
+ shape, originalType.getElementType(), arrayLength,
+ /*boundary_check=*/true,
+ /*memory_space=*/xegpu::MemorySpace::Global, layout);
+ return xegpu::getDistributedVectorType(helperTdescTy);
+}
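
For the VectorType overload, a rank-3 vector is interpreted as array_length copies of a 2-D tile before the helper descriptor is built, so a vector<2x16x16xf16> maps to the same 16x16 descriptor with array_length = 2 as in the sketch above. A minimal sketch of just that front-dimension split (illustrative names, not the MLIR API):

```
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the rank-3 handling: the leading dimension becomes array_length and
// the remaining 2-D shape becomes the helper tensor descriptor shape.
struct HelperDesc {
  std::vector<int64_t> shape;
  int64_t arrayLength;
};

static HelperDesc splitArrayLength(std::vector<int64_t> shape) {
  int64_t arrayLength = 1;
  if (shape.size() == 3) {
    arrayLength = shape.front();
    shape.erase(shape.begin());
  }
  return {shape, arrayLength};
}

int main() {
  HelperDesc desc = splitArrayLength({2, 16, 16});
  // Prints "16x16, array_length=2", matching the helper descriptor built above.
  std::cout << desc.shape[0] << 'x' << desc.shape[1]
            << ", array_length=" << desc.arrayLength << '\n';
}
```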
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
new file mode 100644
index 0000000000000..f8f2cd55c28d0
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribution.mlir
@@ -0,0 +1,162 @@
+// RUN: mlir-opt -xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: gpu.func @store_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+// CHECK: gpu.return
+gpu.module @test {
+gpu.func @store_nd_1d(%arg0: memref<16xf32>){
+ %c0 = arith.constant 0 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16xf32>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @store_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @store_nd_2d(%arg0: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %1 = arith.constant dense<1.000000e+00> : vector<16x16xf16>
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_1d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+gpu.module @test {
+gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_2d
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_nd_array_length
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
+// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16>
+// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16>
+// CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.module @test {
+gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
+ %2 = vector.extract %1[%c0] : vector<16x16xf16> from vector<2x16x16xf16>
+ %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @dpas
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG3:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T1:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]
+// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>, memref<8x16xf32>) -> (vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
+// CHECK: ^bb0(%[[ARG4:[0-9a-zA-Z]+]]: vector<8x16xf16>, %[[ARG5:[0-9a-zA-Z]+]]: vector<16x16xf16>, %[[ARG6:[0-9a-zA-Z]+]]: vector<8x16xf32>, %[[ARG7:[0-9a-zA-Z]+]]: memref<8x16xf32>):
+// CHECK: gpu.yield %[[ARG4]], %[[ARG5]], %[[ARG6]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
+// CHECK: }
+// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[T1]]#0 : vector<8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[T1]]#1 : vector<16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[T4:.*]] = vector.shape_cast %[[T1]]#2 : vector<8x1xf32> to vector<8xf32>
+// CHECK: %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[T4]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+// CHECK: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG3]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T5]], %[[T6]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @dpas(%arg0: vector<8x16xf16>, %arg1: vector<16x16xf16>, %arg3: vector<8x16xf32>, %arg2: memref<8x16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.dpas %arg0, %arg1, %arg3 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %0, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @load_dpas_store
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test {
+gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg3: memref<8x16xf32>){
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+ %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %4 = xegpu.dpas %1, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+ %5 = xegpu.create_nd_tdesc %arg3[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+ xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index,
+// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) {
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64,
+ %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
+ %c0 = arith.constant 0 : index
+ %0 = xegpu.create_nd_tdesc %arg0 [%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+ %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+ %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16>
+ xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+ gpu.return
+}
+}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index 1ae4348af33e6..a5468681e68dc 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -2,27 +2,27 @@
// CHECK: function: test_dpas_f16:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -40,17 +40,17 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
// -----
// CHECK: function: test_dpas_i8:
// CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.dpas %{{.*}} : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
@@ -62,27 +62,27 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
// -----
// CHECK: function: test_load_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] <{transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -99,29 +99,29 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
// -----
// CHECK: function: test_vector_transpose:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T4:.*]] = vector.transpose %[[T3]], [1, 0] : vector<16x16xf16> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T2]], %[[T4]], %[[CST]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
@@ -139,19 +139,19 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
// -----
// CHECK: function: test_extf_truncf:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = arith.extf %[[T1]] : vector<16x16xf16> to vector<16x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T3:.*]] = arith.truncf %[[T2]] : vector<16x16xf32> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -164,29 +164,29 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
// -----
// CHECK: function: test_load_gather_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf16>, vector<16xindex> -> !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load %[[T2]], %[[CST0]] <{transpose}> : !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>>, vector<16xi1> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -204,17 +204,17 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
// -----
// CHECK: function: test_load_gather_1d:
// CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -227,15 +227,15 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
// -----
// CHECK: function: test_store_scatter_with_transpose_effect:
// CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST0:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16, 1], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
%cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -248,15 +248,15 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
// -----
// CHECK: function: test_store_scatter_1d:
// CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST1:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
%cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
%cst_0 = arith.constant dense<true> : vector<16xi1>
@@ -268,27 +268,27 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
// -----
// CHECK: function: test_vector_bitcast_i16_to_i8:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<32x16xi8> -> vector<32x16xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x16xi16> to vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.dpas %[[T4]], %[[T3]] : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
@@ -305,29 +305,29 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
// -----
// CHECK: function: test_vector_bitcast_i8_to_f16:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<8x32xi8> -> vector<8x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 2]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 2]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %[[T1]] : !xegpu.tensor_desc<16x32xi8> -> vector<16x32xi8>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [4, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [4, 1]
// CHECK-NEXT: op : %[[T4:.*]] = vector.bitcast %[[T2]] : vector<8x32xi8> to vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = vector.bitcast %[[T3]] : vector<16x32xi8> to vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
@@ -345,21 +345,21 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
// -----
// CHECK: function: test_binary_op_one_use:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T3:.*]] = arith.addf %[[T1]], %[[T2]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -373,23 +373,23 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
// -----
// CHECK: function: test_binary_op_multiple_uses:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = arith.addf %[[T1]], %[[CST]] : vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -404,39 +404,39 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
// -----
// CHECK: function: test_for_op:
// CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf32>' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 0 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 128 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %{{.*}} = arith.constant 16 : index
-// CHECK-NEXT: sg_map for result #0: Not assigned.
+// CHECK-NEXT: layout for result #0: Not assigned.
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T5:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T7:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T8:.*]] = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.for
-// CHECK-NEXT: sg_map for result #0: Not assigned.
-// CHECK-NEXT: sg_map for result #1: Not assigned.
-// CHECK-NEXT: sg_map for result #2: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: Not assigned.
+// CHECK-NEXT: layout for result #1: Not assigned.
+// CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
@@ -460,23 +460,23 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
// -----
// CHECK: function: test_if_single_use:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [2, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -494,25 +494,25 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
// -----
// CHECK: function: test_if_multiple_uses:
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type 'i1' at index: 2
-// CHECK-NEXT: sg_map : Not assigned.
+// CHECK-NEXT: layout : Not assigned.
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf32>' at index: 3
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 4
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T0:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T3:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T4:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : scf.if
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
%0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
%1 = scf.if %arg2 -> (vector<16x16xf16>) {
@@ -531,13 +531,13 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
// -----
// CHECK: function: test_vector_outer_reduction:
// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
@@ -548,13 +548,13 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
// -----
// CHECK: function: test_vector_inner_reduction:
// CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
-// CHECK-NEXT: sg_map : wi_layout: [1, 16], wi_data: [1, 1]
+// CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1]
// CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
-// CHECK-NEXT: sg_map : wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout : lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
// CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
-// CHECK-NEXT: sg_map for result #0: wi_layout: [16], wi_data: [1]
+// CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
%cst = arith.constant dense<0.000000e+00> : vector<16xf32>
%0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>