[Mlir-commits] [mlir] [mlir][xegpu] Bug fix in UpdateNdOffset distribution. (PR #150545)
Charitha Saumya
llvmlistbot at llvm.org
Tue Aug 5 14:41:11 PDT 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/150545
>From 073bd227f16122d6203f21b549d0fff907fc4f5a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 24 Jul 2025 22:39:37 +0000
Subject: [PATCH 1/3] Fix bug in UpdateNdOffset distribution
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 62 +++++++------------
1 file changed, 22 insertions(+), 40 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8957ea5399ea2..2088c3c7fc5ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -277,22 +277,13 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldValues;
- SmallVector<Type> newYieldTypes;
-
- for (Value operand : descOp->getOperands()) {
- newYieldValues.push_back(operand);
- newYieldTypes.push_back(operand.getType());
- }
rewriter.setInsertionPoint(warpOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, /* new yieled values = */ newYieldValues,
- /* new yielded types = */ newYieldTypes, newRetIndices);
+ rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
+ /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
- SmallVector<Value> newDescOperands;
- for (size_t i : newRetIndices) {
- newDescOperands.push_back(newWarpOp.getResult(i));
- }
+ SmallVector<Value> newDescOperands = llvm::map_to_vector(
+ newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::TensorDescType distributedTensorDescTy =
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
@@ -696,39 +687,30 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
warpOp, "warp result is not a xegpu::UpdateNdOffset op");
auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
unsigned operandIdx = operand->getOperandNumber();
- // new update op does not have layout attribute.
- xegpu::TensorDescType newTensorDescTy =
- updateOp.getTensorDescType().dropLayouts();
- SmallVector<Value, 3> newYieldValues;
- SmallVector<Type, 3> newYieldTypes;
- for (Value operand : updateOp->getOperands()) {
- newYieldValues.push_back(operand);
- if (isa<xegpu::TensorDescType>(operand.getType())) {
- newYieldTypes.push_back(newTensorDescTy);
- } else {
- newYieldTypes.push_back(operand.getType());
- }
- }
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
+ rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
+ newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newUpdateOperands;
- for (size_t i : newRetIndices) {
- // For the tensor descriptor operand, the layout attribute is dropped
- // after distribution. Types needs to be resolved in this case.
- if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
- newUpdateOperands.push_back(resolveDistributedTy(
- newWarpOp.getResult(i), newTensorDescTy, rewriter));
- } else {
- newUpdateOperands.push_back(newWarpOp.getResult(i));
- }
- }
+ // The new update op does not have the layout attribute.
+ xegpu::TensorDescType distributedTensorDescTy =
+ updateOp.getTensorDescType().dropLayouts();
+ SmallVector<Value> newUpdateOperands =
+ llvm::map_to_vector(newRetIndices, [&](size_t i) {
+ // For the tensor descriptor operand, the layout attribute is
+ // dropped after distribution. Types need to be resolved in this
+ // case.
+ if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+ return resolveDistributedTy(newWarpOp.getResult(i),
+ distributedTensorDescTy, rewriter);
+ }
+ return newWarpOp.getResult(i);
+ });
// Create a new update op outside the warp op.
auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
- updateOp->getAttrs());
+ rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
+ newUpdateOperands, updateOp->getAttrs());
xegpu::removeLayoutAttrs(newUpdateOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type with the original type.
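A side note on the refactor above: the hand-written loops that collected warp-op results by index are replaced with llvm::map_to_vector (from llvm/ADT/SmallVectorExtras.h), which maps a range through a callable and collects the results into a SmallVector in one expression. Below is a minimal, standalone sketch of that idiom; the indices and values are made-up stand-ins for newRetIndices and the newWarpOp results, not code from this patch.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/SmallVectorExtras.h"
    #include <cstdio>

    int main() {
      // Stand-ins for newRetIndices and newWarpOp.getResult(i).
      llvm::SmallVector<size_t> retIndices = {0, 2, 3};
      llvm::SmallVector<int> warpResults = {10, 11, 12, 13};

      // Equivalent to: for (size_t i : retIndices) out.push_back(warpResults[i]);
      auto out = llvm::map_to_vector(retIndices,
                                     [&](size_t i) { return warpResults[i]; });

      for (int v : out)
        std::printf("%d\n", v); // prints 10, 12, 13
      return 0;
    }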
>From 4a6f72f34d4a591ee91f9ea70a212f1494cfe447 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 24 Jul 2025 23:17:08 +0000
Subject: [PATCH 2/3] Add test for update_nd_offset distribution
---
.../Dialect/XeGPU/subgroup-distribute.mlir | 24 ++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index e78ae4a17710b..4bfa797e2a9b3 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
// CHECK-LABEL: gpu.func @store_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
@@ -265,6 +265,28 @@ gpu.module @test {
}
}
+// -----
+// Explicitly check that update_nd_offset distributed tensor descriptor retains the layouts.
+// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
+// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: }
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
+// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+ gpu.func @check_update_nd_offset_distributed_tensor_desc() {
+ %c32 = arith.constant 32 : index
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
// -----
// CHECK-LABEL: gpu.func @prefetch_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
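For readers of the new test: the builtin.unrealized_conversion_cast tagged {resolve_simt_type_mismatch} is how the pattern bridges the layout-carrying tensor_desc type yielded by the warp op to the layout-dropped type consumed by the distributed update_nd_offset; with this fix the layout is dropped only at that point, not before the yield. The following is a hypothetical C++ sketch of such a bridging helper, written only for illustration; the helper name, signature, and body are assumptions, and the pattern's actual resolveDistributedTy in XeGPUSubgroupDistribute.cpp is not shown in this patch.

    #include "mlir/IR/Builders.h"
    #include "mlir/IR/BuiltinOps.h"

    // Hypothetical helper (illustration only): cast a value to the expected
    // distributed type via a builtin.unrealized_conversion_cast tagged for
    // later cleanup, mirroring the cast visible in the CHECK lines above.
    static mlir::Value castToDistributedType(mlir::Value orig, mlir::Type expectedTy,
                                             mlir::OpBuilder &builder) {
      if (orig.getType() == expectedTy)
        return orig;
      auto cast = builder.create<mlir::UnrealizedConversionCastOp>(
          orig.getLoc(), mlir::TypeRange{expectedTy}, mlir::ValueRange{orig});
      // Marker attribute matching {resolve_simt_type_mismatch} in the test.
      cast->setAttr("resolve_simt_type_mismatch", builder.getUnitAttr());
      return cast->getResult(0);
    }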
>From 4bb2207b49d766311db295bd141a943294cb6989 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 5 Aug 2025 21:26:50 +0000
Subject: [PATCH 3/3] Fix test comment
---
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 4bfa797e2a9b3..54ef56e013abb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -266,7 +266,7 @@ gpu.module @test {
}
// -----
-// Explicitly check that update_nd_offset distributed tensor descriptor retains the layouts.
+// Explicitly check that the update_nd_offset op's source retains its layout when yielded from the warp op (PR150545).
// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {