[Mlir-commits] [mlir] [mlir][xegpu] Add layout based SIMT distribution support for `vector.extract/insert_strided_slice` (PR #168626)
Charitha Saumya
llvmlistbot at llvm.org
Wed Nov 19 10:33:01 PST 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/168626
>From 4bef60fe9e55eba963179888e28c77568aeccf7e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 17 Nov 2025 19:31:51 +0000
Subject: [PATCH 1/9] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index bbd7733e89c29..a125ed18119be 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1472,6 +1472,40 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
}
};
+static SmallVector<int64_t> getDistributedDims(VectorType sequentialType,
+ VectorType distributedType) {
+ assert(sequentialType.getRank() == distributedType.getRank() &&
+ "sequential and distributed vector types must have the same rank");
+ SmallVector<int64_t> distributedDims;
+ for (int64_t i = 0; i < sequentialType.getRank(); ++i) {
+ if (distributedType.getDimSize(i) != sequentialType.getDimSize(i)) {
+ distributedDims.push_back(i);
+ }
+ }
+ return distributedDims;
+}
+
+struct VectorExtractStridedSliceDistribution
+ : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
+ if (!operand)
+ return failure();
+ auto extractOp =
+ cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
+ unsigned operandIdx = operand->getOperandNumber();
+ auto distributedType =
+ cast<VectorType>(warpOp.getResult(operandIdx).getType());
+ // Find the distributed dimension. There should be exactly one.
+ auto yieldedType = cast<VectorType>(operand->get().getType());
+ auto distributedDims = getDistributedDims(yieldedType, distributedType);
+ return success();
+ }
+};
+
/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op
/// outside of the warp op.
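
For context, a minimal sketch (hypothetical shapes, not part of the patch) of what the new getDistributedDims helper reports: it compares the type yielded inside the warp op against the per-lane type carried on the warp op result, and returns the dimensions whose sizes differ.

  // With 16 lanes and lane_layout = [16, 1], a yielded vector<32x16xf32>
  // appears as vector<2x16xf32> on the warp op result; only dim 0 differs,
  // so getDistributedDims would return {0}.
  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16xf32>) {
    %v = "some_def"() : () -> vector<32x16xf32>
    gpu.yield %v : vector<32x16xf32>
  }
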
>From a261edcc92360fd7d6679bffe6923cdf1b271a11 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 17 Nov 2025 23:41:52 +0000
Subject: [PATCH 2/9] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 124 +++++++++++++++---
1 file changed, 108 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 9865f31e2cbcd..e4f17f0abdc6b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -35,6 +35,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/LogicalResult.h"
namespace mlir {
namespace xegpu {
@@ -174,6 +175,19 @@ static bool requireTranspose(const xegpu::LayoutAttr layout,
return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}
+static SmallVector<int64_t> getDistributedDims(VectorType sequentialType,
+ VectorType distributedType) {
+ assert(sequentialType.getRank() == distributedType.getRank() &&
+ "sequential and distributed vector types must have the same rank");
+ SmallVector<int64_t> distributedDims;
+ for (int64_t i = 0; i < sequentialType.getRank(); ++i) {
+ if (distributedType.getDimSize(i) != sequentialType.getDimSize(i)) {
+ distributedDims.push_back(i);
+ }
+ }
+ return distributedDims;
+}
+
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
/// contained within a WarpExecuteOnLane0Op.
@@ -1471,19 +1485,6 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
}
};
-static SmallVector<int64_t> getDistributedDims(VectorType sequentialType,
- VectorType distributedType) {
- assert(sequentialType.getRank() == distributedType.getRank() &&
- "sequential and distributed vector types must have the same rank");
- SmallVector<int64_t> distributedDims;
- for (int64_t i = 0; i < sequentialType.getRank(); ++i) {
- if (distributedType.getDimSize(i) != sequentialType.getDimSize(i)) {
- distributedDims.push_back(i);
- }
- }
- return distributedDims;
-}
-
struct VectorExtractStridedSliceDistribution
: public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
@@ -1501,6 +1502,96 @@ struct VectorExtractStridedSliceDistribution
// Find the distributed dimension. There should be exactly one.
auto yieldedType = cast<VectorType>(operand->get().getType());
auto distributedDims = getDistributedDims(yieldedType, distributedType);
+ // Only single dimension distribution is supported.
+ if (distributedDims.size() != 1)
+ return rewriter.notifyMatchFailure(
+ warpOp, "Expecting source to be distributed in a single dimension.");
+ int64_t distributedDim = distributedDims[0];
+ // Check if the distributed dimension is fully extracted. If so, we exit
+ // early because this case is already handled by vector distribution
+ // patterns. The distributed dimension is fully extracted if:
+ // 1) The distributed dim comes after all the extracted dimensions.
+ // 2) Or, the size extracted along the distributed dimension is equal to
+ //    the size of that dim in the source vector.
+ auto extractedSizes = extractOp.getSizes();
+ if (distributedDim >= static_cast<int64_t>(extractedSizes.size()))
+ return rewriter.notifyMatchFailure(
+ warpOp, "Distributed dimension is fully extracted, skipping.");
+
+ int distrDimExtractedSize =
+ cast<IntegerAttr>(extractOp.getSizes()[distributedDim]).getInt();
+ if (distrDimExtractedSize ==
+ extractOp.getSourceVectorType().getShape()[distributedDim])
+ return rewriter.notifyMatchFailure(
+ warpOp, "Distributed dimension is fully extracted, skipping.");
+
+ // Check that the source size along the distributed dimension is a multiple
+ // of the extracted size, so the slice is evenly distributable to lanes.
+ int64_t sourceDistrDimSize = yieldedType.getShape()[distributedDim];
+ if (sourceDistrDimSize % distrDimExtractedSize != 0)
+ return rewriter.notifyMatchFailure(
+ warpOp,
+ "Source size along distributed dimension is not a multiple of the "
+ "extracted size.");
+ auto sourceLayout =
+ xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0));
+ auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
+ // Because only single dimension distribution is supported, lane layout size
+ // at the distributed dim must be the subgroup size.
+ int subgroupSize = sourceLaneLayout[distributedDim];
+ // Check if the distributed extracted dim is a multiple of the lane size.
+ if (distrDimExtractedSize % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ warpOp,
+ "Extracted size along distributed dimension is not a multiple of "
+ "lane size in source layout.");
+ auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+ // We expect lane data to be all ones in this case.
+ if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+ return rewriter.notifyMatchFailure(
+ warpOp, "Expecting unit lane data in source layout");
+ // The offset in the distributed dimension must be a multiple of the
+ // subgroup size.
+ int64_t distrDimOffset =
+ cast<IntegerAttr>(extractOp.getOffsets()[distributedDim]).getInt();
+ if (distrDimOffset % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(warpOp,
+ "Offset along distributed dimension "
+ "is not a multiple of subgroup size.");
+ // Do the distribution by yielding the source of the extract op from
+ // the warp op and creating a new extract op outside the warp op.
+ FailureOr<VectorType> sourceDistTypeOrFailure =
+ getDistVecTypeBasedOnLaneLayout(sourceLayout,
+ extractOp.getSourceVectorType());
+ if (failed(sourceDistTypeOrFailure))
+ return rewriter.notifyMatchFailure(
+ warpOp, "failed to get distributed vector type for source");
+ VectorType sourceDistType = sourceDistTypeOrFailure.value();
+ // Create a new warp op that yields the source of the extract op.
+ SmallVector<size_t> newRetIndices;
+ auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, {extractOp.getSource()}, {sourceDistType},
+ newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ // Distributed sizes and offsets must be adjusted.
+ SmallVector<Attribute> distributedSizes = llvm::map_to_vector(
+ extractOp.getSizes(), [](Attribute attr) { return attr; });
+ SmallVector<Attribute> distributedOffsets = llvm::map_to_vector(
+ extractOp.getOffsets(), [](Attribute attr) { return attr; });
+ // Update the distributed sizes to match the distributed type.
+ distributedSizes[distributedDim] =
+ rewriter.getI64IntegerAttr(distributedType.getDimSize(distributedDim));
+ // Update the distributed offsets to match round robin distribution.
+ distributedOffsets[distributedDim] = rewriter.getI64IntegerAttr(
+ distrDimOffset / subgroupSize); // because lane data is 1
+ Value source = newWarpOp.getResult(newRetIndices[0]);
+ // Create a new extract op outside the warp op.
+ Value newExtractOp = vector::ExtractStridedSliceOp::create(
+ rewriter, extractOp.getLoc(), distributedType, source,
+ ArrayAttr::get(rewriter.getContext(), distributedOffsets),
+ ArrayAttr::get(rewriter.getContext(), distributedSizes),
+ extractOp.getStrides());
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
return success();
}
};
@@ -1662,9 +1753,10 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
MemrefExtractAlignedPointerAsIndexDistribution>(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
- patterns.add<VectorShapeCastDistribution>(
- patterns.getContext(),
- /*pattern benefit=*/highPatternBenefit);
+ patterns
+ .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution>(
+ patterns.getContext(),
+ /*pattern benefit=*/highPatternBenefit);
}
void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
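
To make the rewrite concrete, here is a condensed before/after sketch of the extract_strided_slice distribution implemented above. The shapes and offsets mirror the vector_extract_strided_slice_inner_distributed test added later in this series; treat it as an illustration rather than exact pass output.

  // Before: the extract result is yielded from the warp op (inner dim
  // distributed with lane_layout = [1, 16], lane_data = [1, 1]).
  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
    %0 = "some_def"() : () -> vector<24x64xf32>
    %1 = vector.extract_strided_slice %0 {offsets = [8, 48], sizes = [8, 16], strides = [1, 1]}
      : vector<24x64xf32> to vector<8x16xf32>
    gpu.yield %1 : vector<8x16xf32>
  }
  // After: the source is yielded as well and a per-lane extract is created
  // outside the warp op; offset 48 becomes 48 / 16 = 3 and size 16 becomes 1.
  %w:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>, vector<24x4xf32>) {
    %0 = "some_def"() : () -> vector<24x64xf32>
    %1 = vector.extract_strided_slice %0 {offsets = [8, 48], sizes = [8, 16], strides = [1, 1]}
      : vector<24x64xf32> to vector<8x16xf32>
    gpu.yield %1, %0 : vector<8x16xf32>, vector<24x64xf32>
  }
  %e = vector.extract_strided_slice %w#1 {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]}
    : vector<24x4xf32> to vector<8x1xf32>
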
>From 59f90b4bedf8684948c2851a199bec95b1aaecd1 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 19:30:51 +0000
Subject: [PATCH 3/9] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 143 ++++++++++++++++--
1 file changed, 131 insertions(+), 12 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index e4f17f0abdc6b..a3e7f8469cd93 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1535,16 +1535,21 @@ struct VectorExtractStridedSliceDistribution
"source dim size.");
auto sourceLayout =
xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0));
+ if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+ return rewriter.notifyMatchFailure(
+ warpOp, "the source of extract_strided_slice op lacks distribution "
+ "layout");
auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
// Because only single dimension distribution is supported, lane layout size
// at the distributed dim must be the subgroup size.
int subgroupSize = sourceLaneLayout[distributedDim];
- // Check if the distributed extracted dim is a multiple of the lane size.
+ // Check if the distributed extracted dim is a multiple of the subgroup
+ // size.
if (distrDimExtractedSize % subgroupSize != 0)
return rewriter.notifyMatchFailure(
warpOp,
"Extracted size along distributed dimension is not a multiple of "
- "lane size in source layout.");
+ "subgroup size in source layout.");
auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
// We expect lane data to be all ones in this case.
if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
@@ -1560,13 +1565,10 @@ struct VectorExtractStridedSliceDistribution
"is not a multiple of subgroup size.");
// Do the distribution by yielding the source of the extract op from
// the warp op and creating a new extract op outside the warp op.
- FailureOr<VectorType> sourceDistTypeOrFailure =
+ VectorType sourceDistType =
getDistVecTypeBasedOnLaneLayout(sourceLayout,
- extractOp.getSourceVectorType());
- if (failed(sourceDistTypeOrFailure))
- return rewriter.notifyMatchFailure(
- warpOp, "failed to get distributed vector type for source");
- VectorType sourceDistType = sourceDistTypeOrFailure.value();
+ extractOp.getSourceVectorType())
+ .value();
// Create a new warp op that yields the source of the extract op.
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -1581,9 +1583,10 @@ struct VectorExtractStridedSliceDistribution
// Update the distributed sizes to match the distributed type.
distributedSizes[distributedDim] =
rewriter.getI64IntegerAttr(distributedType.getDimSize(distributedDim));
- // Update the distributed offsets to match round robin distribution.
- distributedOffsets[distributedDim] = rewriter.getI64IntegerAttr(
- distrDimOffset / subgroupSize); // because lane data is 1
+ // Update the distributed offsets to match round robin distribution (i.e.
+ // each lane owns data at `subgroupSize` stride given unit lane data).
+ distributedOffsets[distributedDim] =
+ rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
Value source = newWarpOp.getResult(newRetIndices[0]);
// Create a new extract op outside the warp op.
Value newExtractOp = vector::ExtractStridedSliceOp::create(
@@ -1596,6 +1599,121 @@ struct VectorExtractStridedSliceDistribution
}
};
+struct VectorInsertStridedSliceDistribution
+ : public gpu::WarpDistributionPattern {
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override {
+ OpOperand *operand =
+ getWarpResult(warpOp, llvm::IsaPred<vector::InsertStridedSliceOp>);
+ if (!operand)
+ return failure();
+ unsigned int operandNumber = operand->getOperandNumber();
+ auto insertOp =
+ operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
+ auto distributedType =
+ cast<VectorType>(warpOp.getResult(operandNumber).getType());
+ // Find the distributed dimension of the dest vector. There should be
+ // exactly one.
+ auto yieldedType = cast<VectorType>(operand->get().getType());
+ auto destDistributedDims = getDistributedDims(yieldedType, distributedType);
+ // Only single dimension distribution is supported.
+ if (destDistributedDims.size() != 1)
+ return rewriter.notifyMatchFailure(
+ warpOp, "Expecting source to be distributed in a single dimension.");
+ int64_t destDistributedDim = destDistributedDims[0];
+
+ VectorType srcType = insertOp.getSourceVectorType();
+ VectorType destType = insertOp.getDestVectorType();
+ // Currently we require that both source (kD) and dest (nD) vectors are
+ // distributed. This requires that distributedDim (d) is contained in the
+ // last k dims of the dest vector (d >= n - k).
+ int64_t sourceDistributedDim =
+ destDistributedDim - (destType.getRank() - srcType.getRank());
+ if (sourceDistributedDim < 0)
+ return rewriter.notifyMatchFailure(
+ insertOp, "distributed dimension must be in the last k (i.e. source "
+ "rank) dims of dest vector");
+ // If the distributed dimension is fully inserted, skip. This case is
+ // already handled by vector distribution patterns.
+ int64_t destDistrDimSize = destType.getDimSize(destDistributedDim);
+ int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
+ if (srcDistrDimSize == destDistrDimSize)
+ return rewriter.notifyMatchFailure(
+ insertOp, "distributed dimension is fully inserted. This case "
+ "is handled by vector distribution.");
+ // Obtain the source and dest layouts.
+ auto destLayout = xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(1));
+ auto sourceLayout =
+ xegpu::getDistributeLayoutAttr(insertOp->getOpOperand(0));
+ if (!destLayout || !sourceLayout ||
+ destLayout.getEffectiveLaneLayoutAsInt().empty() ||
+ sourceLayout.getEffectiveLaneLayoutAsInt().empty())
+ return rewriter.notifyMatchFailure(
+ warpOp, "the source or dest of insert_strided_slice op lacks "
+ "distribution layout");
+ // Because only single dimension distribution is supported, lane layout
+ // size at the distributed dim must be the subgroup size.
+ int subgroupSize =
+ destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
+ // We require that source and dest lane data are all ones to ensure uniform
+ // round robin distribution.
+ auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
+ auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
+ if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
+ !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
+ return rewriter.notifyMatchFailure(
+ warpOp, "Expecting unit lane data in source and dest layouts");
+ // Distributed dim sizes must be multiples of subgroup size.
+ if (destDistrDimSize % subgroupSize != 0 ||
+ srcDistrDimSize % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ warpOp,
+ "Distributed dimension size in source or dest is not a multiple of "
+ "subgroup size.");
+ // Offsets in the distributed dimension must be multiples of subgroup size.
+ int64_t destDistrDimOffset =
+ cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
+ if (destDistrDimOffset % subgroupSize != 0)
+ return rewriter.notifyMatchFailure(
+ warpOp,
+ "Offset along distributed dimension in dest is not a multiple of "
+ "subgroup size.");
+ // Do the distribution by yielding the source and dest of the insert op from
+ // the warp op and creating a new insert op outside the warp op.
+ VectorType sourceDistType =
+ getDistVecTypeBasedOnLaneLayout(sourceLayout,
+ insertOp.getSourceVectorType())
+ .value();
+ VectorType destDistType = getDistVecTypeBasedOnLaneLayout(
+ destLayout, insertOp.getDestVectorType())
+ .value();
+ // Create a new warp op that yields the source and dest of the insert op.
+ SmallVector<size_t> newRetIndices;
+ auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
+ {sourceDistType, destDistType}, newRetIndices);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ // Distributed offsets must be adjusted.
+ SmallVector<Attribute> distributedOffsets = llvm::map_to_vector(
+ insertOp.getOffsets(), [](Attribute attr) { return attr; });
+ // Update the distributed offsets to match round robin distribution (i.e.
+ // each lane owns data at `subgroupSize` stride given unit lane data).
+ distributedOffsets[destDistributedDim] =
+ rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
+ Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
+ Value dest = newWarpOp.getResult(newRetIndices[1]);
+ // Create a new insert op outside the warp op.
+ Value newInsertOp = vector::InsertStridedSliceOp::create(
+ rewriter, insertOp.getLoc(), destDistType, valueToStore, dest,
+ ArrayAttr::get(rewriter.getContext(), distributedOffsets),
+ insertOp.getStrides());
+ rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
+ newInsertOp);
+ return success();
+ }
+};
+
/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op
/// outside of the warp op.
@@ -1754,7 +1872,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
patterns
- .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution>(
+ .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
+ VectorInsertStridedSliceDistribution>(
patterns.getContext(),
/*pattern benefit=*/highPatternBenefit);
}
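
Analogously, a condensed before/after sketch of the insert_strided_slice rewrite added in this patch. The distributed shapes follow the CHECK lines of the test added later in this series; the pre-distribution offsets [24, 16] are inferred from those CHECK lines and are only illustrative.

  // Before: the insert result is yielded from the warp op (inner dim
  // distributed with lane_layout = [1, 16], lane_data = [1, 1]).
  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x2xf32>) {
    %s = "some_def"() : () -> vector<16x16xf32>
    %d = "some_def"() : () -> vector<64x32xf32>
    %i = vector.insert_strided_slice %s, %d {offsets = [24, 16], strides = [1, 1]}
      : vector<16x16xf32> into vector<64x32xf32>
    gpu.yield %i : vector<64x32xf32>
  }
  // After: source and dest are yielded as well and a per-lane insert is
  // created outside the warp op; the offset along the distributed dim becomes
  // 16 / 16 = 1 while the non-distributed offset stays 24.
  %w:3 = gpu.warp_execute_on_lane_0(%laneid)[16]
      -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
    %s = "some_def"() : () -> vector<16x16xf32>
    %d = "some_def"() : () -> vector<64x32xf32>
    %i = vector.insert_strided_slice %s, %d {offsets = [24, 16], strides = [1, 1]}
      : vector<16x16xf32> into vector<64x32xf32>
    gpu.yield %i, %s, %d : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32>
  }
  %ins = vector.insert_strided_slice %w#1, %w#2 {offsets = [24, 1], strides = [1, 1]}
    : vector<16x1xf32> into vector<64x2xf32>
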
>From f748b80547e00fbaf60688b5696a8f40cd29cbe5 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 22:34:11 +0000
Subject: [PATCH 4/9] save work
---
.../XeGPU/subgroup-distribute-unit.mlir | 674 ++++++++++--------
1 file changed, 386 insertions(+), 288 deletions(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f233dff609f2b..d8ed46646810d 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute -allow-unregistered-dialect \
-// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s
-
+// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute \
+// RUN: -allow-unregistered-dialect -canonicalize -cse %s | FileCheck %s
+gpu.module @xevm_module{
// CHECK-LABEL: gpu.func @store_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
@@ -11,20 +11,17 @@
// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @xevm_module{
- gpu.func @store_nd_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %cst = "some_op"() : () -> vector<16xf32>
- xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- }
- gpu.return
+gpu.func @store_nd_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %cst = "some_op"() : () -> vector<16xf32>
+ xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
}
+ gpu.return
}
-// -----
// CHECK-LABEL: gpu.func @store_nd_2d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
@@ -37,22 +34,18 @@ gpu.module @xevm_module{
// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
// CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @store_nd_2d(%laneid : index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %cst = "some_op"() : () -> vector<16x16xf16>
- xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- gpu.return
+gpu.func @store_nd_2d(%laneid : index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %cst = "some_op"() : () -> vector<16x16xf16>
+ xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
}
+ gpu.return
}
-
-// -----
// CHECK-LABEL: gpu.func @load_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>,
@@ -63,21 +56,19 @@ gpu.module @xevm_module{
// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
-gpu.module @xevm_module{
- gpu.func @load_nd_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
- !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
- gpu.yield %1 : vector<16xf32>
- }
- "some_user_op"(%r) : (vector<1xf32>) -> ()
- gpu.return
+gpu.func @load_nd_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+ !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
+ gpu.yield %1 : vector<16xf32>
}
+ "some_user_op"(%r) : (vector<1xf32>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @load_nd_2d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
@@ -89,21 +80,19 @@ gpu.module @xevm_module{
// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16>
-gpu.module @xevm_module{
- gpu.func @load_nd_2d(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- gpu.yield %1 : vector<16x16xf16>
- }
- "some_user_op"(%r) : (vector<16x1xf16>) -> ()
- gpu.return
+gpu.func @load_nd_2d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+ gpu.yield %1 : vector<16x16xf16>
}
+ "some_user_op"(%r) : (vector<16x1xf16>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @load_nd_array_length
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>,
@@ -118,23 +107,21 @@ gpu.module @xevm_module{
// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16,
// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16>
-gpu.module @xevm_module{
- gpu.func @load_nd_array_length(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
- gpu.yield %1 : vector<2x16x16xf16>
- }
- "some_user_op"(%r) : (vector<2x16x1xf16>) -> ()
- gpu.return
+gpu.func @load_nd_array_length(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
+ #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<2x16x16xf16>
+ gpu.yield %1 : vector<2x16x16xf16>
}
+ "some_user_op"(%r) : (vector<2x16x1xf16>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @dpas
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] ->
@@ -146,29 +133,27 @@ gpu.module @xevm_module{
// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32>
// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-gpu.module @xevm_module{
- gpu.func @dpas(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_op"() : () -> vector<8x16xf16>
- %1 = "some_op"() : () -> vector<16x16xf16>
- %2 = "some_op"() : () -> vector<8x16xf32>
- %3 = xegpu.dpas %0, %1, %2
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- gpu.yield %3 : vector<8x16xf32>
- }
- "some_user_op"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
+gpu.func @dpas(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
+ %0 = "some_op"() : () -> vector<8x16xf16>
+ %1 = "some_op"() : () -> vector<16x16xf16>
+ %2 = "some_op"() : () -> vector<8x16xf32>
+ %3 = xegpu.dpas %0, %1, %2
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
+ gpu.yield %3 : vector<8x16xf32>
}
+ "some_user_op"(%r) : (vector<8x1xf32>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16,
@@ -178,21 +163,19 @@ gpu.module @xevm_module{
// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16,
// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch}
-gpu.module @xevm_module{
- gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
- %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- "some_user_op"(%r)
- : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> ()
- gpu.return
+gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+ %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 ->
+ !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
}
+ "some_user_op"(%r)
+ : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @prefetch_2d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16,
@@ -204,21 +187,19 @@ gpu.module @xevm_module{
// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2]
// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.module @xevm_module{
- gpu.func @prefetch_2d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : ()
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %0[%c0, %c0]
- <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- gpu.return
+gpu.func @prefetch_2d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : ()
+ -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ xegpu.prefetch_nd %0[%c0, %c0]
+ <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
}
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @prefetch_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16,
@@ -229,44 +210,40 @@ gpu.module @xevm_module{
// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch}
// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>,
// CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.module @xevm_module{
- gpu.func @prefetch_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : ()
- -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.prefetch_nd %0[%c0]
- <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
- : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- }
- gpu.return
+gpu.func @prefetch_1d(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %0 = "some_op"() : ()
+ -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ xegpu.prefetch_nd %0[%c0]
+ <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
}
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
// CHECK: gpu.yield %{{.*}}
// CHECK: }
// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
// CHECK: gpu.barrier
-gpu.module @xevm_module{
- gpu.func @gpu_barrier(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0[%c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
- gpu.barrier
- gpu.yield %1 : vector<16xf16>
- }
- "some_user_op"(%r) : (vector<1xf16>) -> ()
- gpu.return
+gpu.func @gpu_barrier(%laneid: index) {
+ %c0 = arith.constant 0 : index
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
+ %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %1 = xegpu.load_nd %0[%c0]
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
+ gpu.barrier
+ gpu.yield %1 : vector<16xf16>
}
+ "some_user_op"(%r) : (vector<1xf16>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
@@ -285,7 +262,6 @@ gpu.module @xevm_module{
// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32>
-gpu.module @xevm_module{
gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -307,9 +283,8 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
"some_user_op"(%r) : (vector<2xf32>) -> ()
gpu.return
}
-}
-// -----
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
@@ -320,7 +295,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index)
// CHECK-NEXT: %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32>
// CHECK-NEXT: gpu.yield %[[T6]] : vector<2xf32>
// CHECK-NEXT: }
-gpu.module @xevm_module{
gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -342,9 +316,8 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
"some_user_op"(%r) : (vector<2xf32>) -> ()
gpu.return
}
-}
-// -----
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
@@ -358,7 +331,6 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index)
// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32
// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
-gpu.module @xevm_module{
gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -380,9 +352,8 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
"some_user_op"(%r) : (vector<2xf32>) -> ()
gpu.return
}
-}
-// -----
+
// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32>
@@ -397,7 +368,6 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index)
// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
// CHECK: gpu.yield %[[T7]] : vector<2xf32>
// CHECK: }
-gpu.module @xevm_module{
gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
%c0 = arith.constant 0 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
@@ -419,9 +389,8 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
"some_user_op"(%r) : (vector<2xf32>) -> ()
gpu.return
}
-}
-// -----
+
// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
@@ -434,35 +403,33 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index)
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
- gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) {
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %1 = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<1>: vector<16xi1>
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
- {
- layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- }
- : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- }
- gpu.return
+gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) {
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %1 = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<1>: vector<16xi1>
+ %offset = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<12> : vector<16xindex>
+ %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+ {
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ }
+ : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+ xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
@@ -475,156 +442,144 @@ gpu.module @xevm_module{
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
- gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) {
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %1 = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<1> : vector<16xi1>
- %offset = arith.constant
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1
- {
- layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
- xegpu.store %3, %src[%offset], %1
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) {
+ gpu.warp_execute_on_lane_0(%laneid)[16] {
+ %1 = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<1> : vector<16xi1>
+ %offset = arith.constant
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+ dense<12> : vector<16xindex>
+ %3 = xegpu.load %src[%offset], %1
+ {
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+ xegpu.store %3, %src[%offset], %1
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
}
- gpu.return
+ : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
}
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16>
// CHECK-NEXT: }
// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index
// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64
-gpu.module @xevm_module{
- gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
- %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
- gpu.yield %ptr : index
- }
- %ptr_i64 = arith.index_cast %r : index to i64
- "some_user_op"(%ptr_i64) : (i64) -> ()
- gpu.return
+gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
+ %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
+ gpu.yield %ptr : index
}
+ %ptr_i64 = arith.index_cast %r : index to i64
+ "some_user_op"(%ptr_i64) : (i64) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @vector_transpose(
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) {
// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32>
// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32>
// CHECK-NEXT: }
// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
-gpu.module @xevm_module{
- gpu.func @vector_transpose(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<16x2xf32>)
- %transpose = vector.transpose %cst, [1, 0]
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x2xf32> to vector<2x16xf32>
- gpu.yield %transpose : vector<2x16xf32>
- }
- "some_user_op"(%r) : (vector<2x1xf32>) -> ()
- gpu.return
+gpu.func @vector_transpose(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+ : () -> (vector<16x2xf32>)
+ %transpose = vector.transpose %cst, [1, 0]
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16x2xf32> to vector<2x16xf32>
+ gpu.yield %transpose : vector<2x16xf32>
}
+ "some_user_op"(%r) : (vector<2x1xf32>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @vector_bitcast(
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) {
// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8>
// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8>
// CHECK: }
// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
-gpu.module @xevm_module{
- gpu.func @vector_bitcast(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
- : () -> (vector<4x32xi8>)
- %bitcast = vector.bitcast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<4x32xi8> to vector<4x16xi16>
- gpu.yield %bitcast : vector<4x16xi16>
- }
- "some_user_op"(%r) : (vector<4x1xi16>) -> ()
- gpu.return
+gpu.func @vector_bitcast(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
+ : () -> (vector<4x32xi8>)
+ %bitcast = vector.bitcast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<4x32xi8> to vector<4x16xi16>
+ gpu.yield %bitcast : vector<4x16xi16>
}
+ "some_user_op"(%r) : (vector<4x1xi16>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing
// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
// CHECK: gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32>
// CHECK: }
// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32>
-gpu.module @xevm_module {
- gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
- : () -> (vector<16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16xf32> to vector<1x16xf32>
- gpu.yield %cast : vector<1x16xf32>
- }
- "some_user_op"(%r) : (vector<1x1xf32>) -> ()
- gpu.return
+gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+ : () -> (vector<16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16xf32> to vector<1x16xf32>
+ gpu.yield %cast : vector<1x16xf32>
}
+ "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+ gpu.return
}
-// -----
+
// CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing(
// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) {
// CHECK: gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32>
// CHECK: }
// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32>
-gpu.module @xevm_module {
- gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<1x16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
- }
- : vector<1x16xf32> to vector<16xf32>
- gpu.yield %cast : vector<16xf32>
- }
- "some_user_op"(%r) : (vector<1xf32>) -> ()
- gpu.return
+gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+ : () -> (vector<1x16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
+ }
+ : vector<1x16xf32> to vector<16xf32>
+ gpu.yield %cast : vector<16xf32>
}
+ "some_user_op"(%r) : (vector<1xf32>) -> ()
+ gpu.return
}
-// -----
+
// NOTE: Layouts are still valid, but distribution still requires a slice layout for the operand.
//
// CHECK-LABEL: gpu.func @vector_shapecast_unsupported
@@ -634,21 +589,164 @@ gpu.module @xevm_module {
// CHECK: }
// CHECK: "some_user_op"(%[[W]]) : (vector<1x1xf32>) -> ()
// CHECK: gpu.return
-gpu.module @xevm_module {
- gpu.func @vector_shapecast_unsupported(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
- : () -> (vector<16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16xf32> to vector<1x16xf32>
- gpu.yield %cast : vector<1x16xf32>
+gpu.func @vector_shapecast_unsupported(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
+ %cst = "some_op"()
+ {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
+ : () -> (vector<16xf32>)
+ %cast = vector.shape_cast %cst
+ {
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<16xf32> to vector<1x16xf32>
+ gpu.yield %cast : vector<1x16xf32>
+ }
+ "some_user_op"(%r) : (vector<1x1xf32>) -> ()
+ gpu.return
+}
+
+
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<32x16xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32>
+// CHECK: }
+// CHECK-NEXT: %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
+// CHECK-NEXT: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32>
+// CHECK-NEXT: "some_use"(%[[T2]]) : (vector<1x16xf32>) -> ()
+gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) {
+ %0 = "some_def"() : () -> (vector<32x16xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ }
+ : vector<32x16xf32> to vector<16x16xf32>
+ gpu.yield %1 : vector<16x16xf32>
+ }
+ "some_use"(%r) : (vector<1x16xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x4xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x64xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x64xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
+// CHECK-SAME: {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
+gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
+ %0 = "some_def"() : () -> (vector<24x64xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+ }
+ : vector<24x64xf32> to vector<8x16xf32>
+ gpu.yield %1 : vector<8x16xf32>
+ }
+ "some_use"(%r) : (vector<8x1xf32>) -> ()
+ gpu.return
+}
+
+
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
+// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) {
+// CHECK: %[[S:.*]] = "some_def"() : () -> vector<64xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<32xf32>, vector<64xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
+// CHECK-SAME: {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<2xf32>) -> ()
+gpu.func @vector_extract_strided_slice_1d(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<64xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<64xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_use"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
+// CHECK: %[[W]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
+// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x32xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
+// CHECK-SAME: {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x2xf32>) -> ()
+gpu.func @vector_insert_strided_slice_inner_distributed(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x2xf32>) {
+ %0 = "some_def"() : () -> (vector<16x16xf32>)
+ %1 = "some_def"() : () -> (vector<64x32xf32>)
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
}
- "some_user_op"(%r) : (vector<1x1xf32>) -> ()
- gpu.return
+ : vector<16x16xf32> into vector<64x32xf32>
+ gpu.yield %2 : vector<64x32xf32>
}
+ "some_use"(%r) : (vector<64x2xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3x32xf32>, vector<1x16xf32>, vector<3x32xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
+// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48x32xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48x32xf32>, vector<16x16xf32>, vector<48x32xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
+// CHECK-SAME: {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3x32xf32>) -> ()
+gpu.func @vector_insert_strided_slice_outer_distributed(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3x32xf32>) {
+ %0 = "some_def"() : () -> (vector<16x16xf32>)
+ %1 = "some_def"() : () -> (vector<48x32xf32>)
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
+ }
+ : vector<16x16xf32> into vector<48x32xf32>
+ gpu.yield %2 : vector<48x32xf32>
+ }
+ "some_use"(%r) : (vector<3x32xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>, vector<1xf32>, vector<3xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32>
+// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48xf32>, vector<16xf32>, vector<48xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
+// CHECK-SAME: {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3xf32>) -> ()
+gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
+ %0 = "some_def"() : () -> (vector<16xf32>)
+ %1 = "some_def"() : () -> (vector<48xf32>)
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16xf32> into vector<48xf32>
+ gpu.yield %2 : vector<48xf32>
+ }
+ "some_use"(%r) : (vector<3xf32>) -> ()
+ gpu.return
+}
+
}
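A side note on the arithmetic the positive tests above exercise: along the single distributed dimension, the per-lane offsets and sizes are the warp-level values divided by the subgroup size taken from the lane layout (for example, offsets = [8, 48] and sizes = [8, 16] with lane_layout = [1, 16] become offsets = [8, 3] and sizes = [8, 1]). A minimal standalone sketch of that scaling follows; the function name is illustrative and not part of the patch.

// Standalone sketch (illustrative, not the patch's code): scale an
// extract_strided_slice offset/size along the distributed dimension when the
// warp-level vector is split across `subgroupSize` lanes.
#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<std::int64_t, std::int64_t>
distributeOffsetAndSize(std::int64_t offset, std::int64_t size,
                        std::int64_t subgroupSize) {
  // Both values are expected to be multiples of the subgroup size; the
  // pattern bails out otherwise (see the negative tests added later).
  assert(offset % subgroupSize == 0 && size % subgroupSize == 0);
  return {offset / subgroupSize, size / subgroupSize};
}

// Example matching @vector_extract_strided_slice_inner_distributed:
// distributeOffsetAndSize(48, 16, 16) yields {3, 1}.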
>From 4905450fc9011677993797c3dbda5550cc7c9e17 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 23:04:34 +0000
Subject: [PATCH 5/9] save work
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 23 +++++-------
.../XeGPU/subgroup-distribute-unit.mlir | 36 +++++++++++++++++++
2 files changed, 44 insertions(+), 15 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a3e7f8469cd93..71df8d4fcbf7d 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1520,19 +1520,12 @@ struct VectorExtractStridedSliceDistribution
int distrDimExtractedSize =
cast<IntegerAttr>(extractOp.getSizes()[distributedDim]).getInt();
- if (distrDimExtractedSize ==
- extractOp.getSourceVectorType().getShape()[distributedDim])
+ int sourceDistrDimSize =
+ extractOp.getSourceVectorType().getShape()[distributedDim];
+ if (distrDimExtractedSize == sourceDistrDimSize)
return rewriter.notifyMatchFailure(
warpOp, "Distributed dimension is fully extracted, skipping.");
- // Check if the size extracted along the distributed dimension is a multiple
- // of the source dim size and should be distributable to lanes.
- int64_t sourceDisrDimSize = yieldedType.getShape()[distributedDim];
- if (sourceDisrDimSize % distrDimExtractedSize != 0)
- return rewriter.notifyMatchFailure(
- warpOp,
- "Extracted size along distributed dimension is not a multiple of "
- "source dim size.");
auto sourceLayout =
xegpu::getDistributeLayoutAttr(extractOp->getOpOperand(0));
if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
@@ -1543,13 +1536,13 @@ struct VectorExtractStridedSliceDistribution
// Because only single dimension distribution is supported, lane layout size
// at the distributed dim must be the subgroup size.
int subgroupSize = sourceLaneLayout[distributedDim];
- // Check if the distributed extracted dim is a multiple of the subgroup
- // size.
- if (distrDimExtractedSize % subgroupSize != 0)
+ // Check if the source size in the distributed dimension is a multiple of
+ // subgroup size.
+ if (sourceDistrDimSize % subgroupSize != 0)
return rewriter.notifyMatchFailure(
warpOp,
- "Extracted size along distributed dimension is not a multiple of "
- "subgroup size in source layout.");
+ "Source size along distributed dimension is not a multiple of "
+ "subgroup size.");
auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
// We expect lane data to be all ones in this case.
if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index d8ed46646810d..4681b0958958c 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -749,4 +749,40 @@ gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
gpu.return
}
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_offset
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
+gpu.func @vector_extract_strided_slice_unsopported_offset(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<64xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<64xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_use"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_source
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
+gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<54xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<54xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_use"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
}
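The two negative tests above pin down when the extract pattern bails out: the source size along the distributed dimension and the offset into it must both be multiples of the subgroup size. A minimal standalone sketch of that check; the helper name is illustrative and not the patch's.

// Standalone sketch of the extract-side legality condition exercised by the
// negative tests above (illustrative name, not the patch's helper).
#include <cstdint>

static bool isDistributableExtract(std::int64_t sourceDimSize,
                                   std::int64_t offset,
                                   std::int64_t subgroupSize) {
  // vector<54xf32> fails the first condition (54 % 16 != 0);
  // offsets = [3] fails the second (3 % 16 != 0).
  return sourceDimSize % subgroupSize == 0 && offset % subgroupSize == 0;
}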
>From 36b27c44f1b697534a659e1f306d7d7c45c20a6e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 23:40:11 +0000
Subject: [PATCH 6/9] save work
---
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 4681b0958958c..4575a981e2986 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -675,7 +675,7 @@ gpu.func @vector_extract_strided_slice_1d(%laneid: index) {
}
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
-// CHECK: %[[W]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
+// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x32xf32>
// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32>
>From c1e9eb4f0ed1cdb9e940a6e1150fa8f3691f0465 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 23:43:05 +0000
Subject: [PATCH 7/9] save work
---
.../Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 71df8d4fcbf7d..7ecbd9226b43f 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1485,6 +1485,10 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
}
};
+// Distribute a `vector.extract_strided_slice` op feeding into the yield op of
+// an enclosing `gpu.warp_execute_on_lane_0` region. This pattern handles the
+// advanced cases where the distributed dimension is only partially extracted,
+// which the generic vector distribution patterns do not support.
struct VectorExtractStridedSliceDistribution
: public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
@@ -1592,6 +1596,10 @@ struct VectorExtractStridedSliceDistribution
}
};
+/// Distribute a `vector.insert_strided_slice` op feeding into the yield op of
+/// an enclosing `gpu.warp_execute_on_lane_0` region. This pattern handles the
+/// advanced cases where the distributed dimension is only partially inserted,
+/// which the generic vector distribution patterns do not support.
struct VectorInsertStridedSliceDistribution
: public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
>From 8975d6a46ccc47bc017d2a3844e4b6ae43a5da51 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 18 Nov 2025 23:44:24 +0000
Subject: [PATCH 8/9] save work
---
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 7ecbd9226b43f..8e261a8f88d2b 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1872,6 +1872,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
MemrefExtractAlignedPointerAsIndexDistribution>(
patterns.getContext(),
/*pattern benefit=*/regularPatternBenefit);
+ // For the following patterns, we need to override the regular vector
+ // distribution patterns. Therefore, assign them a higher benefit.
patterns
.add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
VectorInsertStridedSliceDistribution>(
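The comment added here relies on MLIR's benefit-based pattern ordering: when the XeGPU-specific patterns and the generic vector distribution patterns both match, the one registered with the larger benefit is tried first. A toy, self-contained illustration of that selection rule (not MLIR's actual driver code):

#include <algorithm>
#include <string>
#include <vector>

struct ToyPattern {
  std::string name;
  int benefit;
};

// Toy stand-in for benefit-based ordering: among patterns that match the same
// op, the one with the largest benefit is tried first (vector must be
// non-empty).
static std::string pickFirstPattern(const std::vector<ToyPattern> &matching) {
  return std::max_element(matching.begin(), matching.end(),
                          [](const ToyPattern &a, const ToyPattern &b) {
                            return a.benefit < b.benefit;
                          })
      ->name;
}

// pickFirstPattern({{"GenericVectorDistribution", 1},
//                   {"VectorExtractStridedSliceDistribution", 2}})
// returns "VectorExtractStridedSliceDistribution".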
>From 2324fd3e5f8472ac0e52e02c70bc58ce324a2529 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 19 Nov 2025 18:32:34 +0000
Subject: [PATCH 9/9] add negative cases
---
.../Transforms/XeGPUSubgroupDistribute.cpp | 10 +-
.../XeGPU/subgroup-distribute-unit.mlir | 97 +++++++++++++------
2 files changed, 72 insertions(+), 35 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8e261a8f88d2b..62904f13d61c8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1665,13 +1665,11 @@ struct VectorInsertStridedSliceDistribution
!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
return rewriter.notifyMatchFailure(
warpOp, "Expecting unit lane data in source and dest layouts");
- // Distributed dim sizes must be multiples of subgroup size.
- if (destDistrDimSize % subgroupSize != 0 ||
- srcDistrDimSize % subgroupSize != 0)
+ // Source distributed dim size must be a multiple of the subgroup size.
+ if (srcDistrDimSize % subgroupSize != 0)
return rewriter.notifyMatchFailure(
- warpOp,
- "Distributed dimension size in source or dest is not a multiple of "
- "subgroup size.");
+ warpOp, "Distributed dimension size in source is not a multiple of "
+ "subgroup size.");
// Offsets in the distributed dimension must be multiples of subgroup size.
int64_t destDistrDimOffset =
cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 4575a981e2986..93d9b1ea9904a 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -651,7 +651,6 @@ gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) {
gpu.return
}
-
// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) {
// CHECK: %[[S:.*]] = "some_def"() : () -> vector<64xf32>
@@ -674,6 +673,42 @@ gpu.func @vector_extract_strided_slice_1d(%laneid: index) {
gpu.return
}
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_offset
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
+gpu.func @vector_extract_strided_slice_unsopported_offset(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<64xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<64xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_use"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_source
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
+gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
+ %0 = "some_def"() : () -> (vector<54xf32>)
+ %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<54xf32> to vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
+ }
+ "some_use"(%r) : (vector<2xf32>) -> ()
+ gpu.return
+}
+
// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
@@ -749,39 +784,43 @@ gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_offset
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_offset(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %0 = "some_def"() : () -> (vector<64xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<64xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_source
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.insert_strided_slice
+gpu.func @vector_insert_strided_slice_unsupported_source(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
+ %0 = "some_def"() : () -> (vector<8xf32>)
+ %1 = "some_def"() : () -> (vector<48xf32>)
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<8xf32> into vector<48xf32>
+ gpu.yield %2 : vector<48xf32>
}
- "some_use"(%r) : (vector<2xf32>) -> ()
+ "some_use"(%r) : (vector<3xf32>) -> ()
gpu.return
}
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_source
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %0 = "some_def"() : () -> (vector<54xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<54xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_offset
+// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
+// CHECK: }
+// CHECK-NOT: %{{.*}} = vector.insert_strided_slice
+gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
+ %0 = "some_def"() : () -> (vector<16xf32>)
+ %1 = "some_def"() : () -> (vector<48xf32>)
+ %2 = vector.insert_strided_slice %0, %1 { offsets = [3], strides = [1],
+ layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+ layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+ }
+ : vector<16xf32> into vector<48xf32>
+ gpu.yield %2 : vector<48xf32>
}
- "some_use"(%r) : (vector<2xf32>) -> ()
+ "some_use"(%r) : (vector<3xf32>) -> ()
gpu.return
}
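To summarize the relaxation in this last patch: for vector.insert_strided_slice, only the source slice's size along the distributed dimension and the offsets must be multiples of the subgroup size; the destination size is no longer checked. The new negative tests mirror this: inserting vector<8xf32> into vector<48xf32> fails the size condition for a subgroup of 16, and offsets = [3] fails the offset condition. A minimal standalone sketch, with an illustrative helper name:

// Standalone sketch of the insert-side condition after this change
// (illustrative name, not the patch's helper): the source slice size along
// the distributed dimension and the offset into the destination must be
// multiples of the subgroup size; the destination size itself is not
// constrained here.
#include <cstdint>

static bool isDistributableInsert(std::int64_t srcDimSize,
                                  std::int64_t destOffset,
                                  std::int64_t subgroupSize) {
  // vector<8xf32> into vector<48xf32> fails (8 % 16 != 0);
  // offsets = [3] fails (3 % 16 != 0), matching the new negative tests.
  return srcDimSize % subgroupSize == 0 && destOffset % subgroupSize == 0;
}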