[Mlir-commits] [mlir] 06884d0 - [mlir][xegpu] Bug fix in UpdateNdOffset distribution. (#150545)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Aug 5 14:42:19 PDT 2025
Author: Charitha Saumya
Date: 2025-08-05T14:42:14-07:00
New Revision: 06884d0204ecffc8f1164d520bab6117ae06baee
URL: https://github.com/llvm/llvm-project/commit/06884d0204ecffc8f1164d520bab6117ae06baee
DIFF: https://github.com/llvm/llvm-project/commit/06884d0204ecffc8f1164d520bab6117ae06baee.diff
LOG: [mlir][xegpu] Bug fix in UpdateNdOffset distribution. (#150545)
The reason is that the UpdateNdOffset source operand was not retaining its layout when
it is yielded by the warp op. The `warp_execute_on_lane0` op expects that the
TensorDesc type is unchanged during distribution out of its region. We
use UnrealizedCasts to reconcile this mismatch outside the warpOp (via
`resolveDistributedTy`).
Added:
Modified:
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8957ea5399ea2..2088c3c7fc5ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -277,22 +277,13 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
descOp, "the tensor descriptor lacks layout attribute");
SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldValues;
- SmallVector<Type> newYieldTypes;
-
- for (Value operand : descOp->getOperands()) {
- newYieldValues.push_back(operand);
- newYieldTypes.push_back(operand.getType());
- }
rewriter.setInsertionPoint(warpOp);
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, /* new yieled values = */ newYieldValues,
- /* new yielded types = */ newYieldTypes, newRetIndices);
+ rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
+ /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
- SmallVector<Value> newDescOperands;
- for (size_t i : newRetIndices) {
- newDescOperands.push_back(newWarpOp.getResult(i));
- }
+ SmallVector<Value> newDescOperands = llvm::map_to_vector(
+ newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
rewriter.setInsertionPointAfter(newWarpOp);
xegpu::TensorDescType distributedTensorDescTy =
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
@@ -696,39 +687,30 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
warpOp, "warp result is not a xegpu::UpdateNdOffset op");
auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
unsigned operandIdx = operand->getOperandNumber();
- // new update op does not have layout attribute.
- xegpu::TensorDescType newTensorDescTy =
- updateOp.getTensorDescType().dropLayouts();
- SmallVector<Value, 3> newYieldValues;
- SmallVector<Type, 3> newYieldTypes;
- for (Value operand : updateOp->getOperands()) {
- newYieldValues.push_back(operand);
- if (isa<xegpu::TensorDescType>(operand.getType())) {
- newYieldTypes.push_back(newTensorDescTy);
- } else {
- newYieldTypes.push_back(operand.getType());
- }
- }
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
+ rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
+ newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newUpdateOperands;
- for (size_t i : newRetIndices) {
- // For the tensor descriptor operand, the layout attribute is dropped
- // after distribution. Types needs to be resolved in this case.
- if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
- newUpdateOperands.push_back(resolveDistributedTy(
- newWarpOp.getResult(i), newTensorDescTy, rewriter));
- } else {
- newUpdateOperands.push_back(newWarpOp.getResult(i));
- }
- }
+ // new update op does not have layout attribute.
+ xegpu::TensorDescType distributedTensorDescTy =
+ updateOp.getTensorDescType().dropLayouts();
+ SmallVector<Value> newUpdateOperands =
+ llvm::map_to_vector(newRetIndices, [&](size_t i) {
+ // For the tensor descriptor operand, the layout attribute is
+ // dropped after distribution. Types needs to be resolved in this
+ // case.
+ if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+ return resolveDistributedTy(newWarpOp.getResult(i),
+ distributedTensorDescTy, rewriter);
+ }
+ return newWarpOp.getResult(i);
+ });
// Create a new update op outside the warp op.
auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
- updateOp->getAttrs());
+ rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
+ newUpdateOperands, updateOp->getAttrs());
xegpu::removeLayoutAttrs(newUpdateOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
// Resolve the distributed type with the original type.
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index e78ae4a17710b..54ef56e013abb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
// CHECK-LABEL: gpu.func @store_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
@@ -265,6 +265,28 @@ gpu.module @test {
}
}
+// -----
+// Explicitly check that update_nd_offset op's source retain layout when yielded from the warp op (PR150545)
+// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
+// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: }
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
+// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+ gpu.func @check_update_nd_offset_distributed_tensor_desc() {
+ %c32 = arith.constant 32 : index
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.return
+ }
+}
+
// -----
// CHECK-LABEL: gpu.func @prefetch_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
More information about the Mlir-commits
mailing list