[Mlir-commits] [mlir] [MLIR][XeGPU] Add distribution pattern for xegpu load & store matrix from sg to wi (PR #183179)
Nishant Patel
llvmlistbot at llvm.org
Thu Mar 5 09:11:14 PST 2026
https://github.com/nbpatel updated https://github.com/llvm/llvm-project/pull/183179
>From 3681387579ce152513e41bac443f2605984b5355 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Thu, 19 Feb 2026 19:41:41 +0000
Subject: [PATCH 1/4] Add distribution patterns for load & store matrix
---
.../XeGPUSgToWiDistributeExperimental.cpp | 141 +++++++++++++++++-
.../XeGPU/sg-to-wi-experimental-unit.mlir | 64 ++++++++
2 files changed, 203 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index 3787fbb44e1b8..e7d7b443ab52d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -489,6 +490,141 @@ struct SgToWiMultiDimReduction
}
};
+/// Computes per-workitem coordinates for a matrix op from a fresh gpu.lane_id:
+/// `layout` yields the lane's coords over `payloadShape` (exactly one set is
+/// expected), which are added to `origOffsets`. Returns {} if the query fails.
+static SmallVector<Value> computeDistributedCoordsForMatrixOp(
+ ConversionPatternRewriter &rewriter, Location loc,
+ xegpu::DistributeLayoutAttr layout, ArrayRef<int64_t> payloadShape,
+ ValueRange origOffsets) {
+ Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
+ /*upperBound=*/mlir::IntegerAttr());
+ auto maybeCoords =
+ layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
+ if (failed(maybeCoords))
+ return {};
+ assert(maybeCoords.value().size() == 1 &&
+ "Expected one set of distributed offsets");
+ SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
+ rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
+ getAsOpFoldResult(origOffsets));
+ return llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
+}
+
+/// Distributes a subgroup-level LoadMatrix op to workitem-level.
+/// The layout is used to compute the distributed vector type and coordinates.
+/// When subgroup_block_io is set, coordinates are passed through unchanged.
+/// Otherwise, distributed coordinates are computed from the lane ID.
+struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
+ using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::LoadMatrixOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto layout = op.getLayoutAttr();
+ // Without a layout attribute this pattern does not apply; report no match.
+ if (!layout)
+ return failure();
+
+ VectorType sgPayloadTy = dyn_cast<VectorType>(op.getResult().getType());
+ if (!sgPayloadTy)
+ return rewriter.notifyMatchFailure(
+ op, "the matrix op payload must be a vector type");
+
+ auto loc = op.getLoc();
+ auto offsets = op.getMixedOffsets();
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(op, "the load op must have offsets");
+
+ FailureOr<VectorType> distPayloadTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
+ if (failed(distPayloadTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to distribute matrix op payload based on layout.");
+
+ SmallVector<Value> offsetsAsValues =
+ vector::getAsValues(rewriter, loc, offsets);
+
+ SmallVector<Value> newCoords = offsetsAsValues;
+ if (!op.getSubgroupBlockIoAttr()) {
+ newCoords = computeDistributedCoordsForMatrixOp(
+ rewriter, loc, layout, sgPayloadTy.getShape(), offsetsAsValues);
+ if (newCoords.empty())
+ return rewriter.notifyMatchFailure(
+ op, "Failed to compute distributed coordinates.");
+ }
+
+ SmallVector<int64_t> newConstOffsets(op.getConstOffsets().size(),
+ ShapedType::kDynamic);
+ DenseI64ArrayAttr newConstOffsetsAttr =
+ rewriter.getDenseI64ArrayAttr(newConstOffsets);
+
+ auto newOp = xegpu::LoadMatrixOp::create(
+ rewriter, loc, *distPayloadTyOrFailure, adaptor.getMemDesc(),
+ ValueRange(newCoords), newConstOffsetsAttr, op.getSubgroupBlockIoAttr(),
+ xegpu::DistributeLayoutAttr{});
+ rewriter.replaceOp(op, newOp.getResult());
+ return success();
+ }
+};
+
+/// Distributes a subgroup-level StoreMatrix op to workitem-level.
+/// Same coordinate computation logic as LoadMatrix.
+struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
+ using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(xegpu::StoreMatrixOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto layout = op.getLayoutAttr();
+ // Without a layout attribute this pattern does not apply; report no match.
+ if (!layout)
+ return failure();
+
+ VectorType sgPayloadTy = dyn_cast<VectorType>(op.getData().getType());
+ if (!sgPayloadTy)
+ return rewriter.notifyMatchFailure(
+ op, "the matrix op payload must be a vector type");
+
+ auto loc = op.getLoc();
+ auto offsets = op.getMixedOffsets();
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(op, "the store op must have offsets");
+
+ FailureOr<VectorType> distPayloadTyOrFailure =
+ getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
+ if (failed(distPayloadTyOrFailure))
+ return rewriter.notifyMatchFailure(
+ op, "Failed to distribute matrix op payload based on layout.");
+
+ SmallVector<Value> offsetsAsValues =
+ vector::getAsValues(rewriter, loc, offsets);
+
+ SmallVector<Value> newCoords = offsetsAsValues;
+ if (!op.getSubgroupBlockIoAttr()) {
+ newCoords = computeDistributedCoordsForMatrixOp(
+ rewriter, loc, layout, sgPayloadTy.getShape(), offsetsAsValues);
+ if (newCoords.empty())
+ return rewriter.notifyMatchFailure(
+ op, "Failed to compute distributed coordinates.");
+ }
+
+ SmallVector<int64_t> newConstOffsets(op.getConstOffsets().size(),
+ ShapedType::kDynamic);
+ DenseI64ArrayAttr newConstOffsetsAttr =
+ rewriter.getDenseI64ArrayAttr(newConstOffsets);
+
+ xegpu::StoreMatrixOp::create(
+ rewriter, loc, TypeRange{},
+ castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getData()),
+ distPayloadTyOrFailure.value()),
+ adaptor.getMemDesc(), ValueRange(newCoords), newConstOffsetsAttr,
+ op.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
+ rewriter.eraseOp(op);
+ return success();
+ }
+};
+
/// This pattern rewrites a subgroup-level vector.multi_reduction op to a series
/// of vector.extract_strided_slice, vector.reduction and
/// vector.insert_strided_slice ops. This is used when the reduction dimension
@@ -730,8 +866,9 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiVectorReduction, SgToWiMultiDimReduction>(
- typeConverter, patterns.getContext());
+ SgToWiVectorReduction, SgToWiMultiDimReduction,
+ SgToWiLoadMatrix, SgToWiStoreMatrix>(typeConverter,
+ patterns.getContext());
}
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index 1ec0879d4fb47..24b35228d924a 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -318,3 +318,67 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
gpu.return
}
}
+
+// -----
+// load_matrix and store_matrix with coordinate computation (offsets [0,0])
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @load_store_matrix_1
+// CHECK: %[[LANE_ID1:.*]] = gpu.lane_id
+// CHECK: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
+// CHECK: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
+// CHECK: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
+// CHECK: %[[ROW:.*]] = arith.remui %[[R2]], %{{.*}} : index
+// CHECK: %[[COL:.*]] = arith.remui %[[R1]], %{{.*}} : index
+// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[ROW]], %[[COL]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
+// CHECK: %[[LANE_ID2:.*]] = gpu.lane_id
+// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
+ %c0 = arith.constant 0 : index
+ %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ gpu.return
+}
+}
+
+// -----
+// load_matrix and store_matrix with non-zero offsets [0,1]
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @load_store_matrix_2
+// CHECK: %[[LANE_ID1:.*]] = gpu.lane_id
+// CHECK: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
+// CHECK: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
+// CHECK: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
+// CHECK: %[[MUL:.*]] = arith.muli %[[R2]], %{{.*}} : index
+// CHECK: %[[ROW:.*]] = arith.remui %[[MUL]], %{{.*}} : index
+// CHECK: %[[R3:.*]] = arith.remui %[[R1]], %{{.*}} : index
+// CHECK: %[[ADD:.*]] = arith.addi %[[R3]], %{{.*}} : index
+// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[ROW]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
+// CHECK: %[[LANE_ID2:.*]] = gpu.lane_id
+// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
+ gpu.return
+}
+}
+
+// -----
+// load_matrix and store_matrix with subgroup_block_io (no coordinate computation)
+gpu.module @xevm_module {
+// CHECK-LABEL: gpu.func @load_store_matrix_3
+// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
+// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32>
+// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
+// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index
+gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.load_matrix %arg0[%c0, %c1] <{subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> :
+ !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
+ xegpu.store_matrix %1, %arg0[%c0, %c1] <{subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> :
+ vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
+ gpu.return
+}
+}
>From 284e96bfb40b27a2eddc50bdcce22a16a799d33c Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 23 Feb 2026 23:02:40 +0000
Subject: [PATCH 2/4] clang-format
---
.../XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index e7d7b443ab52d..beddebcfb7df1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -866,9 +866,8 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiVectorReduction, SgToWiMultiDimReduction,
- SgToWiLoadMatrix, SgToWiStoreMatrix>(typeConverter,
- patterns.getContext());
+ SgToWiVectorReduction, SgToWiMultiDimReduction, SgToWiLoadMatrix,
+ SgToWiStoreMatrix>(typeConverter, patterns.getContext());
}
void xegpu::populateXeGPUSgToWiLowerVectorMultiReductionAndLegality(
>From 01cc04703141c10e27b4fc02a71dd5405fd62277 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Mon, 23 Feb 2026 23:05:39 +0000
Subject: [PATCH 3/4] clean up
---
.../Transforms/XeGPUSgToWiDistributeExperimental.cpp | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
index beddebcfb7df1..9433b56b7d882 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
@@ -511,10 +511,7 @@ static SmallVector<Value> computeDistributedCoordsForMatrixOp(
return llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
}
-/// Distributes a subgroup-level LoadMatrix op to workitem-level.
-/// The layout is used to compute the distributed vector type and coordinates.
-/// When subgroup_block_io is set, coordinates are passed through unchanged.
-/// Otherwise, distributed coordinates are computed from the lane ID.
+/// This pattern distributes a subgroup-level LoadMatrix op to workitem-level.
struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;
@@ -568,8 +565,7 @@ struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
}
};
-/// Distributes a subgroup-level StoreMatrix op to workitem-level.
-/// Same coordinate computation logic as LoadMatrix.
+/// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
>From 2dd43ca2ced090ef26649560412660a6a2ed8972 Mon Sep 17 00:00:00 2001
From: nbpatel <nishant.b.patel at intel.com>
Date: Wed, 4 Mar 2026 18:21:34 +0000
Subject: [PATCH 4/4] Use CHECK-DAG
---
.../XeGPU/sg-to-wi-experimental-unit.mlir | 28 +++++++++----------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
index f0d38da8ea4fb..d7b4883760c05 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
@@ -466,12 +466,12 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// load_matrix and store_matrix with coordinate computation (offsets [0,0])
gpu.module @xevm_module {
// CHECK-LABEL: gpu.func @load_store_matrix_1
-// CHECK: %[[LANE_ID1:.*]] = gpu.lane_id
-// CHECK: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
-// CHECK: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
-// CHECK: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
-// CHECK: %[[ROW:.*]] = arith.remui %[[R2]], %{{.*}} : index
-// CHECK: %[[COL:.*]] = arith.remui %[[R1]], %{{.*}} : index
+// CHECK-DAG: %[[LANE_ID1:.*]] = gpu.lane_id
+// CHECK-DAG: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
+// CHECK-DAG: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
+// CHECK-DAG: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
+// CHECK-DAG: %[[ROW:.*]] = arith.remui %[[R2]], %{{.*}} : index
+// CHECK-DAG: %[[COL:.*]] = arith.remui %[[R1]], %{{.*}} : index
// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[ROW]], %[[COL]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
// CHECK: %[[LANE_ID2:.*]] = gpu.lane_id
// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
@@ -487,14 +487,14 @@ gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
// load_matrix and store_matrix with non-zero offsets [0,1]
gpu.module @xevm_module {
// CHECK-LABEL: gpu.func @load_store_matrix_2
-// CHECK: %[[LANE_ID1:.*]] = gpu.lane_id
-// CHECK: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
-// CHECK: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
-// CHECK: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
-// CHECK: %[[MUL:.*]] = arith.muli %[[R2]], %{{.*}} : index
-// CHECK: %[[ROW:.*]] = arith.remui %[[MUL]], %{{.*}} : index
-// CHECK: %[[R3:.*]] = arith.remui %[[R1]], %{{.*}} : index
-// CHECK: %[[ADD:.*]] = arith.addi %[[R3]], %{{.*}} : index
+// CHECK-DAG: %[[LANE_ID1:.*]] = gpu.lane_id
+// CHECK-DAG: %[[R1:.*]] = arith.remui %[[LANE_ID1]], %{{.*}} : index
+// CHECK-DAG: %[[D1:.*]] = arith.divui %[[LANE_ID1]], %{{.*}} : index
+// CHECK-DAG: %[[R2:.*]] = arith.remui %[[D1]], %{{.*}} : index
+// CHECK-DAG: %[[MUL:.*]] = arith.muli %[[R2]], %{{.*}} : index
+// CHECK-DAG: %[[ROW:.*]] = arith.remui %[[MUL]], %{{.*}} : index
+// CHECK-DAG: %[[R3:.*]] = arith.remui %[[R1]], %{{.*}} : index
+// CHECK-DAG: %[[ADD:.*]] = arith.addi %[[R3]], %{{.*}} : index
// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[ROW]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
// CHECK: %[[LANE_ID2:.*]] = gpu.lane_id
// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
More information about the Mlir-commits
mailing list