[Mlir-commits] [mlir] fe4c2bb - [mlir][xegpu] Deprecate XeGPUSubgroupDistribute and rename XeGPUSgToWiDistributeExperimental to XeGPUSgToLaneDistribute (#198027)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue May 26 07:52:52 PDT 2026
Author: Nishant Patel
Date: 2026-05-26T07:52:46-07:00
New Revision: fe4c2bb1b3d5626d7de8112e815b8548b20a81fe
URL: https://github.com/llvm/llvm-project/commit/fe4c2bb1b3d5626d7de8112e815b8548b20a81fe
DIFF: https://github.com/llvm/llvm-project/commit/fe4c2bb1b3d5626d7de8112e815b8548b20a81fe.diff
LOG: [mlir][xegpu] Deprecate XeGPUSubgroupDistribute and rename XeGPUSgToWiDistributeExperimental to XeGPUSgToLaneDistribute (#198027)
The XeGPUSubgroupDistribute pass is now fully superseded by the newer
subgroup-to-lane distribution flow, so this PR removes its
implementation & all associated tests.
The replacement pass XeGPUSgToWiDistributeExperimental is renamed to
XeGPUSgToLaneDistribute.
Added:
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
Modified:
mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
Removed:
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir
################################################################################
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 4bee1752b271e..90aa15f75f55f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -11,15 +11,6 @@
include "mlir/Pass/PassBase.td"
-def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
- let summary = "Distribute XeGPU ops to work items";
- let description = [{
- The pass distributes subgroup level (SIMD) XeGPU ops to work items.
- }];
- let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
- "vector::VectorDialect", "index::IndexDialect"];
-}
-
def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
let summary = "Propagate and assign XeGPU layout information";
let description = [{
@@ -109,10 +100,10 @@ def XeGPUPeepHoleOptimizer : Pass<"xegpu-optimize-peephole"> {
"vector::VectorDialect"];
}
-def XeGPUSgToWiDistributeExperimental : Pass<"xegpu-sg-to-wi-distribute-experimental"> {
- let summary = "Distribute XeGPU ops to work items";
+def XeGPUSgToLaneDistribute : Pass<"xegpu-sg-to-lane-distribute"> {
+ let summary = "Distribute XeGPU ops to lanes";
let description = [{
- The pass distributes subgroup level XeGPU ops to work item level XeGPU ops.
+ The pass distributes subgroup level XeGPU ops to lane level XeGPU ops.
}];
let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
"vector::VectorDialect", "index::IndexDialect"];
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index a21866b5cc33f..919a69908bdce 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -65,21 +65,18 @@ struct UnrollOptions {
void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns);
/// Appends patterns for array length optimization into `patterns`.
void populateXeGPUArrayLengthOptimizationPatterns(RewritePatternSet &patterns);
-/// Appends patterns for XeGPU SIMT distribution into `patterns`.
-void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
-/// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
-void populateXeGPUMoveFuncBodyToWarpOpPatterns(RewritePatternSet &patterns);
/// Appends patterns for XeGPU workgroup to subgroup distribution into
/// `patterns`.
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns);
-/// Define only the type conversions needed for XeGPU subgroup to workitem
+/// Define only the type conversions needed for XeGPU subgroup to lane
/// distribution.
-void populateXeGPUSgToWiDistributeTypeConversions(TypeConverter &typeConverter);
-/// Defines type conversions and legality for XeGPU subgroup to workitem
+void populateXeGPUSgToLaneDistributeTypeConversions(
+ TypeConverter &typeConverter);
+/// Defines type conversions and legality for XeGPU subgroup to lane
/// distribution and appends the required conversion patterns into `patterns`.
-/// Appends patterns for XeGPU subgroup to workitem distribution into
+/// Appends patterns for XeGPU subgroup to lane distribution into
/// `patterns`.
-void populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+void populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target);
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index ec5591aa85613..fa0ab712fa64d 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -91,8 +91,7 @@ void buildGPUPassPipeline(OpPassManager &pm,
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(
xegpu::createXeGPUPropagateLayout(laneLayoutOptions));
- pm.addNestedPass<gpu::GPUModuleOp>(
- xegpu::createXeGPUSgToWiDistributeExperimental());
+ pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSgToLaneDistribute());
pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
pm.addNestedPass<gpu::GPUModuleOp>(createLoopInvariantCodeMotionPass());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 0e30a6ee6e3f0..37922f7ef7d24 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,8 +1,7 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUArrayLengthOptimization.cpp
XeGPUBlocking.cpp
- XeGPUSgToWiDistributeExperimental.cpp
- XeGPUSubgroupDistribute.cpp
+ XeGPUSgToLaneDistribute.cpp
XeGPUUnroll.cpp
XeGPUWgToSgDistribute.cpp
XeGPUPropagateLayout.cpp
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
similarity index 90%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
index 2af5429395526..8a926754e7cfb 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToWiDistributeExperimental.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSgToLaneDistribute.cpp
@@ -1,4 +1,4 @@
-//===- XeGPUSgToWiDistributeExperimental.cpp - XeGPU SG to WI Pass --------===//
+//===- XeGPUSgToLaneDistribute.cpp - XeGPU SG to Lane Pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -33,14 +33,14 @@
namespace mlir {
namespace xegpu {
-#define GEN_PASS_DEF_XEGPUSGTOWIDISTRIBUTEEXPERIMENTAL
+#define GEN_PASS_DEF_XEGPUSGTOLANEDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
using namespace mlir;
-#define DEBUG_TYPE "xegpu-sg-to-wi-distribute-experimental"
+#define DEBUG_TYPE "xegpu-sg-to-lane-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
namespace {
@@ -84,7 +84,7 @@ static bool isValidSubgroupMultiReductionOp(vector::MultiDimReductionOp op) {
return op.getReductionDims().size() == 1;
}
-/// A vector::MultiDimReductionOp is doing lane-local reduction if each workitem
+/// A vector::MultiDimReductionOp is doing lane-local reduction if each lane
/// is doing its own local reduction. In this case the result layout ensures
/// that result vector is distributed to lanes, i.e. the result vector type is
/// different from the distributed result vector type.
@@ -112,9 +112,10 @@ static SmallVector<int64_t> getDistributedDims(VectorType originalType,
return distributedDims;
}
-/// Distributes a subgroup-level CreateNdDesc op to workitem-level CreateNdDesc
+/// Distributes a subgroup-level CreateNdDesc op to lane-level CreateNdDesc
/// op. This simply drops the layout attribute from the tensor descriptor type.
-struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
+struct SgToLaneCreateNdDesc
+ : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
LogicalResult
@@ -133,10 +134,10 @@ struct SgToWiCreateNdDesc : public OpConversionPattern<xegpu::CreateNdDescOp> {
}
};
-/// Distributes a subgroup-level LoadNd op to workitem-level LoadNd op. Output
-/// of workitem-level LoadNd op is 1D. ShapeCast is added to restore the
+/// Distributes a subgroup-level LoadNd op to lane-level LoadNd op. Output
+/// of lane-level LoadNd op is 1D. ShapeCast is added to restore the
/// original rank.
-struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
+struct SgToLaneLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
LogicalResult
@@ -157,19 +158,18 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
op, "xegpu::LoadNdOp require target attribute attached to "
"determine transpose "
"requirement");
- auto supportedWiResultTyOrFailure =
+ auto supportedLaneResultTyOrFailure =
xegpu::getDistributedVectorType(op.getTensorDescType());
- auto expectedWiResultTyOrFailure =
+ auto expectedLaneResultTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(layout, op.getType());
- if (failed(supportedWiResultTyOrFailure))
+ if (failed(supportedLaneResultTyOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute the workitem vector type for LoadNdOp");
- if (failed(expectedWiResultTyOrFailure))
+ op, "unable to compute the lane vector type for LoadNdOp");
+ if (failed(expectedLaneResultTyOrFailure))
return rewriter.notifyMatchFailure(
- op,
- "unable to compute expected workitem vector type from lane layout");
+ op, "unable to compute expected lane vector type from lane layout");
auto newOp = xegpu::LoadNdOp::create(
- rewriter, op.getLoc(), supportedWiResultTyOrFailure.value(),
+ rewriter, op.getLoc(), supportedLaneResultTyOrFailure.value(),
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getPackedAttr(),
op.getTransposeAttr(), op.getL1HintAttr(), op.getL2HintAttr(),
op.getL3HintAttr(), /**layout**/ nullptr);
@@ -179,15 +179,15 @@ struct SgToWiLoadNd : public OpConversionPattern<xegpu::LoadNdOp> {
if (xegpu::requireTranspose(cast<xegpu::LayoutAttr>(layout), uArch))
newOp.setTranspose(DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
- expectedWiResultTyOrFailure.value()));
+ expectedLaneResultTyOrFailure.value()));
return success();
}
};
-/// Distributes a subgroup-level StoreNd op to workitem-level StoreNd op. Stored
-/// value in workitem-level StoreNd op is 1D. ShapeCast is added to cast the
+/// Distributes a subgroup-level StoreNd op to lane-level StoreNd op. Stored
+/// value in lane-level StoreNd op is 1D. ShapeCast is added to cast the
/// incoming value to 1D.
-struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
+struct SgToLaneStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
using OpConversionPattern<xegpu::StoreNdOp>::OpConversionPattern;
LogicalResult
@@ -206,18 +206,18 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
if (valueLayout != layout)
return rewriter.notifyMatchFailure(
op, "conflicting layout attributes on value and anchor");
- auto supportedWiValueTyOrFailure =
+ auto supportedLaneValueTyOrFailure =
xegpu::getDistributedVectorType(op.getTensorDescType());
- if (failed(supportedWiValueTyOrFailure))
+ if (failed(supportedLaneValueTyOrFailure))
return rewriter.notifyMatchFailure(
op,
- "unable to compute wi vector type for StoreNdOp value from tensor "
+ "unable to compute lane vector type for StoreNdOp value from tensor "
"descriptor");
xegpu::StoreNdOp::create(
rewriter, op.getLoc(),
castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getValue()),
- supportedWiValueTyOrFailure.value()),
+ supportedLaneValueTyOrFailure.value()),
adaptor.getTensorDesc(), op.getMixedOffsets(), op.getL1HintAttr(),
op.getL2HintAttr(), op.getL3HintAttr(), /**layout**/ nullptr);
rewriter.eraseOp(op);
@@ -225,10 +225,10 @@ struct SgToWiStoreNd : public OpConversionPattern<xegpu::StoreNdOp> {
}
};
-/// Distributes a subgroup-level Dpas op to workitem-level Dpas op. All inpputs
-/// and output of workitem-level Dpas op are 1D. Necessary casts are added to
+/// Distributes a subgroup-level Dpas op to lane-level Dpas op. All inpputs
+/// and output of lane-level Dpas op are 1D. Necessary casts are added to
/// convert the inputs and output to/from 1D.
-struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
+struct SgToLaneDpas : public OpConversionPattern<xegpu::DpasOp> {
using OpConversionPattern<xegpu::DpasOp>::OpConversionPattern;
LogicalResult
@@ -240,22 +240,22 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
auto layoutCd = cast<xegpu::LayoutAttr>(op.getLayoutCdAttr());
if (!layoutA || !layoutB || !layoutCd)
return failure();
- auto wiResultTyOrFailure =
+ auto laneResultTyOrFailure =
xegpu::getDistributedVectorType(op.getType(), layoutCd);
- auto wiATypeOrFailure =
+ auto laneATypeOrFailure =
xegpu::getDistributedVectorType(op.getLhs().getType(), layoutA);
- auto wiBTypeOrFailure =
+ auto laneBTypeOrFailure =
xegpu::getDistributedVectorType(op.getRhs().getType(), layoutB);
- auto expectedWiResultTyOrFailure =
+ auto expectedLaneResultTyOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(layoutCd, op.getType());
- if (failed(wiResultTyOrFailure) || failed(wiATypeOrFailure) ||
- failed(wiBTypeOrFailure))
+ if (failed(laneResultTyOrFailure) || failed(laneATypeOrFailure) ||
+ failed(laneBTypeOrFailure))
return rewriter.notifyMatchFailure(
- op, "failed to calculate supported workitem vector types for DpasOp "
+ op, "failed to calculate supported lane vector types for DpasOp "
"from layouts");
- if (failed(expectedWiResultTyOrFailure))
+ if (failed(expectedLaneResultTyOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute expected workitem vector type for DpasOp from "
+ op, "unable to compute expected lane vector type for DpasOp from "
"lane layout");
// Validate bit widths match uArch packed format requirements
@@ -266,13 +266,13 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
uArch->getInstruction(
xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc));
if (uArchInstruction) {
- auto wiAType = wiATypeOrFailure.value();
- auto wiBType = wiBTypeOrFailure.value();
+ auto laneAType = laneATypeOrFailure.value();
+ auto laneBType = laneBTypeOrFailure.value();
// Calculate total packed bit width = element bit width * vector size
unsigned aPackedBitWidth =
- wiAType.getElementTypeBitWidth() * wiAType.getNumElements();
+ laneAType.getElementTypeBitWidth() * laneAType.getNumElements();
unsigned bPackedBitWidth =
- wiBType.getElementTypeBitWidth() * wiBType.getNumElements();
+ laneBType.getElementTypeBitWidth() * laneBType.getNumElements();
unsigned expectedABitSize = uArchInstruction->getPackedFormatBitSizeA();
unsigned expectedBBitSize = uArchInstruction->getPackedFormatBitSizeB();
@@ -290,26 +290,26 @@ struct SgToWiDpas : public OpConversionPattern<xegpu::DpasOp> {
}
auto newOp = xegpu::DpasOp::create(
- rewriter, op->getLoc(), wiResultTyOrFailure.value(),
+ rewriter, op->getLoc(), laneResultTyOrFailure.value(),
castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getLhs()),
- wiATypeOrFailure.value()),
+ laneATypeOrFailure.value()),
castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getRhs()),
- wiBTypeOrFailure.value()),
+ laneBTypeOrFailure.value()),
castValueTo(rewriter, cast<TypedValue<VectorType>>(adaptor.getAcc()),
- wiResultTyOrFailure.value()),
+ laneResultTyOrFailure.value()),
/** layoutA**/ nullptr,
/** layoutB**/ nullptr, /** layoutCd**/ nullptr);
// Explicitly set the new types to enable correct type materializations.
rewriter.replaceOp(op, castValueTo(rewriter, newOp.getResult(),
- expectedWiResultTyOrFailure.value()));
+ expectedLaneResultTyOrFailure.value()));
return success();
}
};
-/// Distributes elementwise ops to workitem-level elementwise ops. This
+/// Distributes elementwise ops to lane-level elementwise ops. This
/// currently handles elementwise ops with single result only.
-struct SgToWiElementWise : public ConversionPattern {
- SgToWiElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
+struct SgToLaneElementWise : public ConversionPattern {
+ SgToLaneElementWise(TypeConverter &typeConverter, MLIRContext *ctx)
: ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
LogicalResult
@@ -330,14 +330,14 @@ struct SgToWiElementWise : public ConversionPattern {
return rewriter.notifyMatchFailure(
op, "operation result does not have subgroup distribute layout");
- auto wiShapeOrFailure =
+ auto laneShapeOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
- if (failed(wiShapeOrFailure))
+ if (failed(laneShapeOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute workitem vector type from the layout");
+ op, "unable to compute lane vector type from the layout");
- VectorType newResultType = wiShapeOrFailure.value();
+ VectorType newResultType = laneShapeOrFailure.value();
OperationState state(op->getLoc(), op->getName());
state.addOperands(operands);
state.addTypes(newResultType);
@@ -353,9 +353,9 @@ struct SgToWiElementWise : public ConversionPattern {
}
};
-/// Distributes a subgroup-level arith ConstantOp to workitem-level arith
+/// Distributes a subgroup-level arith ConstantOp to lane-level arith
/// ConstantOp.
-struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
+struct SgToLaneArithConstant : public OpConversionPattern<arith::ConstantOp> {
using OpConversionPattern<arith::ConstantOp>::OpConversionPattern;
LogicalResult
@@ -377,14 +377,14 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
return rewriter.notifyMatchFailure(
op, "operation result does not have subgroup distribute layout");
- auto wiShapeOrFailure =
+ auto laneShapeOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(layout, resultType);
- if (failed(wiShapeOrFailure))
+ if (failed(laneShapeOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute workitem vector type from the layout");
+ op, "unable to compute lane vector type from the layout");
- VectorType newResultType = wiShapeOrFailure.value();
+ VectorType newResultType = laneShapeOrFailure.value();
auto sclarValue = dense.getSplatValue<Attribute>();
auto newDenseAttr = DenseElementsAttr::get(newResultType, sclarValue);
@@ -395,8 +395,8 @@ struct SgToWiArithConstant : public OpConversionPattern<arith::ConstantOp> {
}
};
-/// Distributes a subgroup-level PrefetchNd op to workitem-level PrefetchNd op.
-struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
+/// Distributes a subgroup-level PrefetchNd op to lane-level PrefetchNd op.
+struct SgToLanePrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
using OpConversionPattern<xegpu::PrefetchNdOp>::OpConversionPattern;
LogicalResult
@@ -416,7 +416,7 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
}
};
-/// Distributes a subgroup-level LoadGather (xegpu.load) op to workitem-level.
+/// Distributes a subgroup-level LoadGather (xegpu.load) op to lane-level.
///
/// Example 1 (1D, no chunk size):
/// layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
@@ -449,7 +449,7 @@ struct SgToWiPrefetchNd : public OpConversionPattern<xegpu::PrefetchNdOp> {
/// %offset = producer_op : vector<1x1x1xindex>
/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
-struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
+struct SgToLaneLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
using OpConversionPattern<xegpu::LoadGatherOp>::OpConversionPattern;
LogicalResult
@@ -478,8 +478,7 @@ struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
xegpu::getDistVecTypeBasedOnLaneLayout(layout, origResultTy);
if (failed(distResultTyOrFailure))
return rewriter.notifyMatchFailure(
- op,
- "unable to compute expected workitem vector type from lane layout");
+ op, "unable to compute expected lane vector type from lane layout");
VectorType distResultTy = distResultTyOrFailure.value();
VectorType distResultTy1D = VectorType::get({distResultTy.getNumElements()},
@@ -516,10 +515,11 @@ struct SgToWiLoadGather : public OpConversionPattern<xegpu::LoadGatherOp> {
};
/// This pattern distributes a subgroup-level vector.reduction op to
-/// workitem-level. This require shuffling the data across the workitems (using
-/// gpu::ShuffleOp) and reducing in stages until all workitems have the final
+/// lane-level. This require shuffling the data across the lanes (using
+/// gpu::ShuffleOp) and reducing in stages until all lanes have the final
/// result.
-struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
+struct SgToLaneVectorReduction
+ : public OpConversionPattern<vector::ReductionOp> {
using OpConversionPattern<vector::ReductionOp>::OpConversionPattern;
LogicalResult
@@ -561,10 +561,10 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
op, "Reduction distribution currently only supports floats and "
"integer types.");
- // Get the distributed vector (per work-item portion).
+ // Get the distributed vector (per lane portion).
Value laneValVec = adaptor.getVector();
- // Distribute and reduce across work-items in the subgroup.
+ // Distribute and reduce across lanes in the subgroup.
Value fullReduce = xegpu::subgroupReduction(
op.getLoc(), rewriter, laneValVec, op.getKind(), sgSize);
@@ -579,10 +579,10 @@ struct SgToWiVectorReduction : public OpConversionPattern<vector::ReductionOp> {
};
/// This pattern distributes a subgroup-level vector.multi_reduction op to
-/// workitem-level only if the reduction is lane-local. This means that
+/// lane-level only if the reduction is lane-local. This means that
/// reduction dimension is not distributed to lanes and each lane does its own
/// local reduction.
-struct SgToWiMultiDimReduction
+struct SgToLaneMultiDimReduction
: public OpConversionPattern<vector::MultiDimReductionOp> {
using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
@@ -643,7 +643,7 @@ struct SgToWiMultiDimReduction
};
/// Helper to compute distributed coordinates for matrix ops.
-/// When not using subgroup_block_io, each workitem computes its own
+/// When not using subgroup_block_io, each lane computes its own
/// coordinates based on the layout and lane ID.
static SmallVector<Value> computeDistributedCoordsForMatrixOp(
ConversionPatternRewriter &rewriter, Location loc,
@@ -663,8 +663,8 @@ static SmallVector<Value> computeDistributedCoordsForMatrixOp(
return llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
}
-/// This pattern distributes a subgroup-level LoadMatrix op to workitem-level.
-struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
+/// This pattern distributes a subgroup-level LoadMatrix op to lane-level.
+struct SgToLaneLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
using OpConversionPattern<xegpu::LoadMatrixOp>::OpConversionPattern;
LogicalResult
@@ -717,8 +717,9 @@ struct SgToWiLoadMatrix : public OpConversionPattern<xegpu::LoadMatrixOp> {
}
};
-/// Distributes a subgroup-level vector.transpose op to workitem-level.
-struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
+/// Distributes a subgroup-level vector.transpose op to lane-level.
+struct SgToLaneVectorTranspose
+ : public OpConversionPattern<vector::TransposeOp> {
using OpConversionPattern<vector::TransposeOp>::OpConversionPattern;
LogicalResult
@@ -753,9 +754,9 @@ struct SgToWiVectorTranspose : public OpConversionPattern<vector::TransposeOp> {
}
};
-/// Distributes a subgroup-level vector.bitcast op to workitem-level.
+/// Distributes a subgroup-level vector.bitcast op to lane-level.
/// Bitcast only impacts the innermost dimension of the source/result vectors.
-struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
+struct SgToLaneVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
using OpConversionPattern<vector::BitCastOp>::OpConversionPattern;
LogicalResult
@@ -781,8 +782,8 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
};
/// Distributes a subgroup-level vector.create_mask or vector.constant_mask op
-/// to workitem-level. Uses `computeDistributedCoords()` to obtain the
-/// coordinates each workitem owns, then compares each coordinate against the
+/// to lane-level. Uses `computeDistributedCoords()` to obtain the
+/// coordinates each lane owns, then compares each coordinate against the
/// original mask bounds using `arith.cmpi slt`. The per-element boolean
/// results are assembled into the distributed mask vector.
///
@@ -806,7 +807,7 @@ struct SgToWiVectorBitcast : public OpConversionPattern<vector::BitCastOp> {
template <typename OpType,
typename = std::enable_if_t<llvm::is_one_of<
OpType, vector::CreateMaskOp, vector::ConstantMaskOp>::value>>
-struct SgToWiCreateMask : public OpConversionPattern<OpType> {
+struct SgToLaneCreateMask : public OpConversionPattern<OpType> {
using OpConversionPattern<OpType>::OpConversionPattern;
LogicalResult
@@ -823,7 +824,7 @@ struct SgToWiCreateMask : public OpConversionPattern<OpType> {
getDistVecTypeBasedOnLaneLayout(layout, origType);
if (failed(distTypeOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute workitem vector type from the layout");
+ op, "unable to compute lane vector type from the layout");
VectorType distType = distTypeOrFailure.value();
Location loc = op.getLoc();
@@ -884,8 +885,8 @@ struct SgToWiCreateMask : public OpConversionPattern<OpType> {
}
};
-/// This pattern distributes a subgroup-level StoreMatrix op to workitem-level.
-struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
+/// This pattern distributes a subgroup-level StoreMatrix op to lane-level.
+struct SgToLaneStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
using OpConversionPattern<xegpu::StoreMatrixOp>::OpConversionPattern;
LogicalResult
@@ -941,7 +942,7 @@ struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
};
/// Distributes a subgroup-level StoreScatter (xegpu.store) op to
-/// workitem-level.
+/// lane-level.
///
/// Example 1 (1D, no chunk size):
/// layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
@@ -974,7 +975,8 @@ struct SgToWiStoreMatrix : public OpConversionPattern<xegpu::StoreMatrixOp> {
/// %offset = producer_op : vector<1x1x1xindex>
/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
/// memref<256xf16>, vector<1xindex>, vector<1xi1>
-struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
+struct SgToLaneStoreScatter
+ : public OpConversionPattern<xegpu::StoreScatterOp> {
using OpConversionPattern<xegpu::StoreScatterOp>::OpConversionPattern;
LogicalResult
@@ -1002,8 +1004,7 @@ struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
xegpu::getDistVecTypeBasedOnLaneLayout(layout, origValueTy);
if (failed(distValueTyOrFailure))
return rewriter.notifyMatchFailure(
- op,
- "unable to compute expected workitem vector type from lane layout");
+ op, "unable to compute expected lane vector type from lane layout");
VectorType distValueTy = distValueTyOrFailure.value();
VectorType distValueTy1D = VectorType::get({distValueTy.getNumElements()},
@@ -1039,11 +1040,11 @@ struct SgToWiStoreScatter : public OpConversionPattern<xegpu::StoreScatterOp> {
}
};
-/// Distribute a vector::StepOp to workitem-level.
+/// Distribute a vector::StepOp to lane-level.
/// The layout must have exactly 1 effective lane dimension.
/// We completely resolve the vector::StepOp by computing the lane_data-sized
/// subranges.
-struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
+struct SgToLaneVectorStep : public OpConversionPattern<vector::StepOp> {
using OpConversionPattern<vector::StepOp>::OpConversionPattern;
LogicalResult
@@ -1057,12 +1058,12 @@ struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
auto loc = op.getLoc();
auto stepResultVecTy = op.getResult().getType();
- auto wiShapeOrFailure =
+ auto laneShapeOrFailure =
xegpu::getDistVecTypeBasedOnLaneLayout(resultLayout, stepResultVecTy);
- if (failed(wiShapeOrFailure))
+ if (failed(laneShapeOrFailure))
return rewriter.notifyMatchFailure(
- op, "unable to compute workitem vector type from the layout");
- VectorType newVecTy = wiShapeOrFailure.value();
+ op, "unable to compute lane vector type from the layout");
+ VectorType newVecTy = laneShapeOrFailure.value();
Value laneId = gpu::LaneIdOp::create(rewriter, loc, rewriter.getIndexType(),
/*upperBound=*/mlir::IntegerAttr());
@@ -1103,9 +1104,9 @@ struct SgToWiVectorStep : public OpConversionPattern<vector::StepOp> {
}
};
-/// Distributes a subgroup-level vector.extract op to workitem-level. Only
+/// Distributes a subgroup-level vector.extract op to lane-level. Only
/// handles sub-vector extraction (result is VectorType, not scalar).
-struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
+struct SgToLaneVectorExtract : public OpConversionPattern<vector::ExtractOp> {
using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
LogicalResult
@@ -1137,8 +1138,9 @@ struct SgToWiVectorExtract : public OpConversionPattern<vector::ExtractOp> {
}
};
-/// This pattern distributes a subgroup-level ShapeCast op to workitem-level.
-struct SgToWiVectorShapeCast : public OpConversionPattern<vector::ShapeCastOp> {
+/// This pattern distributes a subgroup-level ShapeCast op to lane-level.
+struct SgToLaneVectorShapeCast
+ : public OpConversionPattern<vector::ShapeCastOp> {
using OpConversionPattern<vector::ShapeCastOp>::OpConversionPattern;
LogicalResult
@@ -1165,9 +1167,9 @@ struct SgToWiVectorShapeCast : public OpConversionPattern<vector::ShapeCastOp> {
};
/// Distributes a subgroup-level vector.extract_strided_slice op to
-/// workitem-level. If the result is distributed, the offsets and sizes are
+/// lane-level. If the result is distributed, the offsets and sizes are
/// adjusted to match the distributed types.
-struct SgToWiVectorExtractStridedSlice
+struct SgToLaneVectorExtractStridedSlice
: public OpConversionPattern<vector::ExtractStridedSliceOp> {
using OpConversionPattern<vector::ExtractStridedSliceOp>::OpConversionPattern;
@@ -1257,7 +1259,7 @@ struct SgToWiVectorExtractStridedSlice
};
/// This pattern distributes a subgroup-level `vector.broadcast` op to
-/// workitem-level. The pattern supports three cases:
+/// lane-level. The pattern supports three cases:
///
/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
/// vector must have a slice layout of the result. If the distributed source
@@ -1313,7 +1315,7 @@ struct SgToWiVectorExtractStridedSlice
/// %0 = "some_op"() : f16
/// %1 = vector.broadcast %0 : f16 to vector<16x1xf16>
/// ```
-struct SgToWiBroadcast : public OpConversionPattern<vector::BroadcastOp> {
+struct SgToLaneBroadcast : public OpConversionPattern<vector::BroadcastOp> {
using OpConversionPattern<vector::BroadcastOp>::OpConversionPattern;
LogicalResult
@@ -1376,9 +1378,9 @@ struct SgToWiBroadcast : public OpConversionPattern<vector::BroadcastOp> {
};
/// Distributes a subgroup-level vector.insert_strided_slice op to
-/// workitem-level. If the dest is distributed, the offsets are adjusted to
+/// lane-level. If the dest is distributed, the offsets are adjusted to
/// match the distributed types.
-struct SgToWiVectorInsertStridedSlice
+struct SgToLaneVectorInsertStridedSlice
: public OpConversionPattern<vector::InsertStridedSliceOp> {
using OpConversionPattern<vector::InsertStridedSliceOp>::OpConversionPattern;
@@ -1470,9 +1472,9 @@ struct SgToWiVectorInsertStridedSlice
}
};
-/// Distributes a subgroup-level vector.insert op to workitem-level. Only
+/// Distributes a subgroup-level vector.insert op to lane-level. Only
/// handles sub-vector insertion (value to store is VectorType, not scalar).
-struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
+struct SgToLaneVectorInsert : public OpConversionPattern<vector::InsertOp> {
using OpConversionPattern<vector::InsertOp>::OpConversionPattern;
LogicalResult
@@ -1506,7 +1508,7 @@ struct SgToWiVectorInsert : public OpConversionPattern<vector::InsertOp> {
};
/// Folds a subgroup-level ConvertLayout op with compatible lane layouts.
-struct SgToWiConvertLayout
+struct SgToLaneConvertLayout
: public OpConversionPattern<xegpu::ConvertLayoutOp> {
using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
@@ -1536,7 +1538,7 @@ struct SgToWiConvertLayout
};
// Trivially distribute `vector.interleave`
-struct SgToWiVectorInterleave
+struct SgToLaneVectorInterleave
: public OpConversionPattern<vector::InterleaveOp> {
using OpConversionPattern<vector::InterleaveOp>::OpConversionPattern;
@@ -1552,7 +1554,7 @@ struct SgToWiVectorInterleave
};
// Trivially distribute `vector.deinterleave`
-struct SgToWiVectorDeinterleave
+struct SgToLaneVectorDeinterleave
: public OpConversionPattern<vector::DeinterleaveOp> {
using OpConversionPattern<vector::DeinterleaveOp>::OpConversionPattern;
@@ -1567,7 +1569,7 @@ struct SgToWiVectorDeinterleave
}
};
-struct SgToWiDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
+struct SgToLaneDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
using OpConversionPattern<xegpu::DpasMxOp>::OpConversionPattern;
LogicalResult
@@ -1682,15 +1684,15 @@ struct SgToWiDpasMx : public OpConversionPattern<xegpu::DpasMxOp> {
}
};
-struct XeGPUSgToWiDistributeExperimentalPass
- : public xegpu::impl::XeGPUSgToWiDistributeExperimentalBase<
- XeGPUSgToWiDistributeExperimentalPass> {
+struct XeGPUSgToLaneDistributePass
+ : public xegpu::impl::XeGPUSgToLaneDistributeBase<
+ XeGPUSgToLaneDistributePass> {
void runOnOperation() override;
};
} // namespace
-void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
+void XeGPUSgToLaneDistributePass::runOnOperation() {
// Recover temporary operand layouts for usage in patterns.
Operation *root = getOperation();
@@ -1719,17 +1721,17 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
RewritePatternSet patterns(&getContext());
typeConverter.addSourceMaterialization(materializeCast);
typeConverter.addTargetMaterialization(materializeCast);
- xegpu::populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+ xegpu::populateXeGPUSgToLaneDistributeTypeConversions(typeConverter);
scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
patterns, target);
- xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
typeConverter, patterns, target);
target.addLegalOp<UnrealizedConversionCastOp>();
(void)applyPartialConversion(root, target, std::move(patterns));
}
// Structural type conversion can generate some redundant
// UnrealizedConversionCastOps to materialize the SG type from type converted
- // WI type. These are redundant at this point and can be eliminated by
+ // lane type. These are redundant at this point and can be eliminated by
// inserting shape casts instead.
// Example:
// %1 = UnrealizedConversionCastOp %0 : vector<16x1xf32> to vector<16x16xf32>
@@ -1790,7 +1792,7 @@ void XeGPUSgToWiDistributeExperimentalPass::runOnOperation() {
xegpu::removeTemporaryLayoutAttrs(getOperation());
}
-void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
+void xegpu::populateXeGPUSgToLaneDistributeTypeConversions(
TypeConverter &typeConverter) {
// Any type other than TensorDescType and VectorType are legal as is.
typeConverter.addConversion([](Type type) -> std::optional<Type> {
@@ -1824,10 +1826,10 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversions(
});
}
-void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+void xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
TypeConverter &typeConverter, RewritePatternSet &patterns,
ConversionTarget &target) {
- populateXeGPUSgToWiDistributeTypeConversions(typeConverter);
+ populateXeGPUSgToLaneDistributeTypeConversions(typeConverter);
// CreateNdDescOp is legal only if its result type has no layout attribute.
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
[&](xegpu::CreateNdDescOp op) { return !op.getType().getLayoutAttr(); });
@@ -1914,16 +1916,17 @@ void xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
return !xegpu::getTemporaryLayout(op->getOpResult(0));
});
target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
- patterns.add<SgToWiCreateNdDesc, SgToWiLoadNd, SgToWiStoreNd, SgToWiDpas,
- SgToWiElementWise, SgToWiArithConstant, SgToWiPrefetchNd,
- SgToWiLoadGather, SgToWiStoreScatter, SgToWiVectorReduction,
- SgToWiMultiDimReduction, SgToWiVectorExtract, SgToWiVectorInsert,
- SgToWiVectorExtractStridedSlice, SgToWiVectorInsertStridedSlice,
- SgToWiLoadMatrix, SgToWiStoreMatrix, SgToWiConvertLayout,
- SgToWiVectorTranspose, SgToWiVectorBitcast, SgToWiVectorStep,
- SgToWiVectorShapeCast, SgToWiBroadcast,
- SgToWiCreateMask<vector::CreateMaskOp>,
- SgToWiCreateMask<vector::ConstantMaskOp>,
- SgToWiVectorDeinterleave, SgToWiVectorInterleave, SgToWiDpasMx>(
- typeConverter, patterns.getContext());
+ patterns.add<
+ SgToLaneCreateNdDesc, SgToLaneLoadNd, SgToLaneStoreNd, SgToLaneDpas,
+ SgToLaneElementWise, SgToLaneArithConstant, SgToLanePrefetchNd,
+ SgToLaneLoadGather, SgToLaneStoreScatter, SgToLaneVectorReduction,
+ SgToLaneMultiDimReduction, SgToLaneVectorExtract, SgToLaneVectorInsert,
+ SgToLaneVectorExtractStridedSlice, SgToLaneVectorInsertStridedSlice,
+ SgToLaneLoadMatrix, SgToLaneStoreMatrix, SgToLaneConvertLayout,
+ SgToLaneVectorTranspose, SgToLaneVectorBitcast, SgToLaneVectorStep,
+ SgToLaneVectorShapeCast, SgToLaneBroadcast,
+ SgToLaneCreateMask<vector::CreateMaskOp>,
+ SgToLaneCreateMask<vector::ConstantMaskOp>, SgToLaneVectorDeinterleave,
+ SgToLaneVectorInterleave, SgToLaneDpasMx>(typeConverter,
+ patterns.getContext());
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
deleted file mode 100644
index 1b4dddcb4ae55..0000000000000
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ /dev/null
@@ -1,2280 +0,0 @@
-//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
-#include "mlir/Dialect/Index/IR/IndexDialect.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
-#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
-#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
-#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
-#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
-#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
-#include "mlir/IR/AffineMap.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/TypeRange.h"
-#include "mlir/IR/Value.h"
-#include "mlir/IR/Visitors.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/InliningUtils.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SmallVectorExtras.h"
-
-namespace mlir {
-namespace xegpu {
-#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
-#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
-} // namespace xegpu
-} // namespace mlir
-
-#define DEBUG_TYPE "xegpu-subgroup-distribute"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-
-using namespace mlir;
-
-static const char *const resolveSIMTTypeMismatch =
- "resolve_simt_type_mismatch"; // Attribute name for identifying
- // UnrelizedConversionCastOp added to resolve
- // SIMT type mismatches.
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// SIMT Distribution Patterns
-//===----------------------------------------------------------------------===//
-
-/// In certain cases, we may need to favor XeGPU specific distribution patterns
-/// over generic vector distribution patterns. In such cases, we can assign
-/// priorities to patterns.
-enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 };
-
-/// Helper function to resolve types if the distributed type out of
-/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
-/// Example 1:
-/// distributed type: vector<8x1xf32>
-/// expected type: vector<8xf32>
-/// resolved using,
-/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
-/// Example 2:
-/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
-/// expected type: xegpu.tensor_desc<8x16xf32>
-/// resolved using,
-/// %0 = unrealized_conversion_cast %1 :
-/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
-/// xegpu.tensor_desc<8x16xf32>
-template <typename T>
-static Value resolveDistributedTy(Value orig, T expected,
- PatternRewriter &rewriter) {
- // If orig and expected types are the same, return orig.
- if (orig.getType() == expected)
- return orig;
- // If orig is a vector type, create a shape cast op to reconcile the types.
- if (isa<VectorType>(orig.getType())) {
- auto castOp =
- vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
- return castOp.getResult();
- }
- // If orig is a tensor descriptor type, create an unrealized conversion cast
- // op to reconcile the types.
- if (isa<xegpu::TensorDescType>(orig.getType())) {
- auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
- expected, orig);
- castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
- return castOp.getResult(0);
- }
- llvm_unreachable("Unsupported type for reconciliation");
- return orig;
-}
-
-/// Given a vector type and its distributed vector type, return the list of
-/// dimensions that are distributed.
-static SmallVector<int64_t> getDistributedDims(VectorType originalType,
- VectorType distributedType) {
- assert(originalType.getRank() == distributedType.getRank() &&
- "sequential and distributed vector types must have the same rank");
- SmallVector<int64_t> distributedDims;
- for (int64_t i = 0; i < originalType.getRank(); ++i) {
- if (distributedType.getDimSize(i) != originalType.getDimSize(i)) {
- distributedDims.push_back(i);
- }
- }
- return distributedDims;
-}
-
-/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
-/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
-/// contained within a WarpExecuteOnLane0Op.
-/// Example:
-///
-/// ```
-/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
-/// ...
-/// ...
-/// gpu.return %result: vector<8x16xf32>
-/// }
-/// ```
-/// To
-/// ```
-/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
-/// %laneid = gpu.lane_id : index
-/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
-/// ...
-/// ...
-/// gpu.yield %result: vector<8x16xf32>
-/// }
-/// return %0
-/// }
-struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
- using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
- LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
- PatternRewriter &rewriter) const override {
- auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
- if (!uArch)
- return rewriter.notifyMatchFailure(
- gpuFuncOp, "Subgroup distribution requires target attribute attached "
- "to set the warp size");
- if (!gpuFuncOp.getBody().hasOneBlock())
- return rewriter.notifyMatchFailure(
- gpuFuncOp, "expected gpu.func to have a single block");
-
- // If the function only contains a single void return, skip.
- if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
- return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
- }))
- return failure();
- // If the function already moved inside a warp_execute_on_lane0, skip.
- if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
- return isa<gpu::WarpExecuteOnLane0Op>(op);
- }))
- return failure();
- gpu::ReturnOp origReturnOp = dyn_cast_if_present<gpu::ReturnOp>(
- gpuFuncOp.getBlocks().back().getTerminator());
- if (!origReturnOp)
- return rewriter.notifyMatchFailure(
- gpuFuncOp, "expected gpu.func terminator to be gpu.return");
- // Create a new function with the same signature and same attributes.
- SmallVector<Type> workgroupAttributionsTypes =
- llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributionBBArgs(),
- [](BlockArgument arg) { return arg.getType(); });
- SmallVector<Type> privateAttributionsTypes =
- llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
- [](BlockArgument arg) { return arg.getType(); });
- auto newGpuFunc = gpu::GPUFuncOp::create(
- rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
- gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
- privateAttributionsTypes);
- newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
- // Create a WarpExecuteOnLane0Op with same arguments and results as the
- // original gpuFuncOp.
- rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
- auto laneId = gpu::LaneIdOp::create(
- rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
- /** upperBound = **/ mlir::IntegerAttr());
- ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
- auto warpOp = gpu::WarpExecuteOnLane0Op::create(
- rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
- uArch->getSubgroupSize(), newGpuFunc.getArguments(),
- newGpuFunc.getArgumentTypes());
- Block &warpBodyBlock = warpOp.getBodyRegion().front();
- // Replace the ReturnOp of the original gpu function with a YieldOp.
- rewriter.setInsertionPointAfter(origReturnOp);
- gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
- origReturnOp.getOperands());
- rewriter.eraseOp(origReturnOp);
- // Move the original function body to the WarpExecuteOnLane0Op body.
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
- warpOp.getBodyRegion().begin());
- rewriter.eraseBlock(&warpBodyBlock);
- // Insert a new ReturnOp after the WarpExecuteOnLane0Op.
- rewriter.setInsertionPointAfter(warpOp);
- gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
- rewriter.replaceOp(gpuFuncOp, newGpuFunc);
- return success();
- }
-};
-
-/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
-/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
-/// still contain the original op that will not be used by the yield op (and
-/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
-/// arguments. Tensor descriptor shape is not distributed because it is a
-/// uniform value across all work items within the subgroup. However, the
-/// layout information is dropped in the new tensor descriptor type.
-///
-/// Example:
-///
-/// ```
-/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
-/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
-/// ...
-/// %td = xegpu.create_nd_tdesc %arg0
-/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
-/// vector.yield %td
-/// }
-/// ```
-/// To
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
-/// ...
-/// %dead = xegpu.create_nd_tdesc %arg0
-/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
-/// vector.yield %arg0, %dead
-/// }
-/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
-/// -> !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand =
- getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a xegpu::CreateNdDesc op");
- auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
- unsigned operandIdx = operand->getOperandNumber();
-
- xegpu::DistributeLayoutAttr layout = descOp.getType().getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- descOp, "the tensor descriptor lacks layout attribute");
- SmallVector<size_t> newRetIndices;
- rewriter.setInsertionPoint(warpOp);
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
- /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
-
- SmallVector<Value> newDescOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
- rewriter.setInsertionPointAfter(newWarpOp);
- xegpu::TensorDescType distributedTensorDescTy =
- descOp.getType().dropLayouts(); // Distributed tensor descriptor type
- // does not contain layout info.
- Value newDescOp = xegpu::CreateNdDescOp::create(
- rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
- descOp->getAttrs());
-
- Value distributedVal = newWarpOp.getResult(operandIdx);
- // Resolve the distributed type to the expected type.
- newDescOp =
- resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
- rewriter.replaceAllUsesWith(distributedVal, newDescOp);
- return success();
- }
-};
-
-/// Distribute a store_nd op at the end of enclosing
-/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
-/// through the warp op interface they would be propagated as returned values.
-/// Source vector is distributed based on lane layout. Appropriate cast ops are
-/// inserted if the distributed types does not match expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-/// gpu.warp_execute_on_lane_0(%laneid) -> () {
-/// ...
-/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>
-/// }
-/// ```
-/// To
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-/// ...
-/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
-/// }
-/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
-/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-/// #layout0>
-/// -> !xegpu.tensor_desc<4x8xf32>
-/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
-/// !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- gpu::YieldOp yield = warpOp.getTerminator();
- Operation *lastNode = yield->getPrevNode();
- auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
- if (!storeOp)
- return failure();
-
- SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
- // Expecting offsets to be present.
- if (offsets.empty())
- return rewriter.notifyMatchFailure(storeOp,
- "the store op must have offsets");
- SmallVector<Value> offsetsAsValues =
- vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
- SmallVector<Type> offsetTypes = llvm::map_to_vector(
- offsetsAsValues, [](Value v) { return v.getType(); });
- xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
- xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- storeOp, "the source tensor descriptor lacks layout attribute");
-
- FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
- xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
- if (failed(distributedTypeByWarpOpOrFailure))
- return rewriter.notifyMatchFailure(storeOp,
- "Failed to distribute the type");
- VectorType distributedTypeByWarpOp =
- distributedTypeByWarpOpOrFailure.value();
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldedValues = {storeOp.getValue(),
- storeOp.getTensorDesc()};
- SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
- newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
- newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
- // Create a new store op outside the warp op with the distributed vector
- // type. Tensor descriptor is not distributed.
- rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newStoreOperands;
-
- // For the value operand, there can be a mismatch between the vector type
- // distributed by the warp op and (xegpu-specific) distributed type
- // supported by the store op. Type mismatch must be resolved using
- // appropriate cast op.
- FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
- xegpu::getDistributedVectorType(storeOp.getTensorDescType());
- if (failed(storeNdDistributedValueTyOrFailure))
- return rewriter.notifyMatchFailure(
- storeOp, "Failed to get distributed vector type for the store op");
- newStoreOperands.push_back(resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[0]),
- storeNdDistributedValueTyOrFailure.value(), rewriter));
- // For the tensor descriptor operand, the layout attribute is dropped after
- // distribution. Types needs to be resolved in this case also.
- xegpu::TensorDescType distributedTensorDescTy =
- storeOp.getTensorDescType().dropLayouts();
- newStoreOperands.push_back(
- resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
- distributedTensorDescTy, rewriter));
- // Collect offsets.
- for (size_t i = 2; i < newRetIndices.size(); ++i)
- newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
-
- auto newStoreOp =
- xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
- newStoreOperands, storeOp->getAttrs());
- xegpu::removeLayoutAttrs(newStoreOp);
- rewriter.eraseOp(storeOp);
- return success();
- }
-};
-
-/// Distribute a load_nd op feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
-/// The warp op will still contain the original op that will not be used by
-/// the yield op (and should be cleaned up later). The yield op will
-/// bypass the load's arguments. Only the loaded vector is distributed
-/// according to lane layout and, tensor descriptor types is not
-/// distributed. Appropriate cast ops are inserted if the distributed types does
-/// not match expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
-/// (vector<4x1xf32>) {
-/// ...
-/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
-/// ->
-/// vector<4x8xf32>
-/// gpu.yield %ld
-/// }
-/// ```
-/// To
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
-/// ...
-/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> ->
-/// vector<4x8xf32> gpu.yield %dead, %arg0
-/// }
-/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
-/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
-/// %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32>
-///
-/// ```
-struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
- if (!isa<xegpu::LoadNdOp>(op))
- return false;
- // Make sure the same load op is the last operation in the warp op body.
- // This ensure that load op is not sinked earlier violating any barrier
- // synchronizations.
- gpu::YieldOp yield = warpOp.getTerminator();
- return yield->getPrevNode() == op;
- });
-
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a xegpu::LoadNd op");
-
- auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
- auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
- if (!uArch)
- return rewriter.notifyMatchFailure(
- loadOp, "xegpu::LoadNdOp require target attribute attached to "
- "determine transpose "
- "requirement");
- // Chip information is required to decide if the layout requires transpose
- // effect.
- // Expecting offsets to be present.
- SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
- if (offsets.empty())
- return rewriter.notifyMatchFailure(loadOp,
- "the load op must have offsets");
- SmallVector<Value> offsetsAsValues =
- vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
- SmallVector<Type> offsetTypes = llvm::map_to_vector(
- offsetsAsValues, [](Value v) { return v.getType(); });
-
- xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
- xegpu::DistributeLayoutAttr layout = tensorDescTy.getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- loadOp, "the source tensor descriptor lacks layout attribute");
-
- unsigned operandIdx = operand->getOperandNumber();
- VectorType distributedTypeByWarpOp =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
- SmallVector<Type> newYieldedTypes = {tensorDescTy};
- newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
- newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
-
- // Create a new load op outside the warp op with the distributed vector
- // type.
- rewriter.setInsertionPointAfter(newWarpOp);
- FailureOr<VectorType> loadNdDistValueTyOrFailure =
- xegpu::getDistributedVectorType(loadOp.getTensorDescType());
- if (failed(loadNdDistValueTyOrFailure))
- return rewriter.notifyMatchFailure(
- loadOp, "Failed to get distributed vector type for the load op");
- xegpu::TensorDescType distributedTensorDescTy =
- loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
- // descriptor type does not
- // contain layout info.
- SmallVector<Value> newLoadOperands{
- resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
- distributedTensorDescTy, rewriter)};
- // Collect offsets.
- for (size_t i = 1; i < newRetIndices.size(); ++i)
- newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
- auto newLoadOp = xegpu::LoadNdOp::create(
- rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
- newLoadOperands, loadOp->getAttrs());
- xegpu::removeLayoutAttrs(newLoadOp);
- // Set the packed attribute if the layout requires it.
- newLoadOp.setPacked(xegpu::requirePacked(layout));
- // Set the transpose attribute if the layout requires it.
- if (xegpu::requireTranspose(layout, uArch))
- newLoadOp.setTranspose(
- DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
- Value distributedVal = newWarpOp.getResult(operandIdx);
- // There can be a conflict between the vector type distributed by the
- // warp op and (xegpu-specific) distributed type supported by the load
- // op. Resolve these mismatches by inserting a cast.
- Value tyResolvedVal = resolveDistributedTy(
- newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
- rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
- return success();
- }
-};
-
-/// Distribute a dpas op feeding into vector.yield op for the enclosing
-/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
-/// The warp op will still contain the original op that will not be used by
-/// the yield op (and should be cleaned up later). The yield op will
-/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
-/// distributed types does not match expected xegpu SIMT types.
-/// Example:
-/// ```
-/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
-/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
-/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
-/// (vector<8x1xf32>) {
-/// ...
-/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
-/// vector<8x16xf32>
-/// gpu.yield %dpas
-/// }
-/// ```
-/// To
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
-/// vector<8x1xf16>, vector<16x1xf16>) {
-/// ...
-/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
-/// -> vector<8x16xf32>
-/// gpu.yield %dead, %arg0, %arg1
-/// }
-/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
-/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
-/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
-/// vector<8xf32>
-/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
-/// ```
-struct DpasDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(warpOp,
- "warp result is not a xegpu::Dpas op");
-
- auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
- unsigned operandIdx = operand->getOperandNumber();
-
- xegpu::LayoutAttr layoutA =
- dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
- xegpu::LayoutAttr layoutB =
- dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
- xegpu::LayoutAttr layoutOut =
- dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
-
- if (!layoutA || !layoutB || !layoutOut)
- return rewriter.notifyMatchFailure(
- dpasOp,
- "the xegpu::Dpas op lacks layout attribute for A, B or output");
-
- FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
- FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
- FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
-
- if (failed(distLhsTypeByWarpOpOrFailure) ||
- failed(distRhsTypeByWarpOpOrFailure) ||
- failed(distResultTypeByWarpOpOrFailure))
- return rewriter.notifyMatchFailure(
- dpasOp,
- "Failed to distribute the A, B or output types in xegpu::Dpas op");
-
- llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
- dpasOp.getRhs()};
- llvm::SmallVector<Type, 3> newYieldTypes{
- distLhsTypeByWarpOpOrFailure.value(),
- distRhsTypeByWarpOpOrFailure.value()};
- // Dpas acc operand is optional.
- if (dpasOp.getAcc()) {
- newYieldValues.push_back(dpasOp.getAcc());
- newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
- }
- // Create a new warp op without the dpas.
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
-
- FailureOr<VectorType> expectedDistLhsTyOrFailure =
- xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
- FailureOr<VectorType> expectedDistRhsTyOrFailure =
- xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
- FailureOr<VectorType> expectedDistResultTyOrFailure =
- xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
-
- if (failed(expectedDistLhsTyOrFailure) ||
- failed(expectedDistRhsTyOrFailure) ||
- failed(expectedDistResultTyOrFailure))
- return rewriter.notifyMatchFailure(
- dpasOp,
- "Failed to get distributed vector type for the dpas operands.");
- // Create a new dpas op outside the warp op.
- rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newDpasOperands;
- SmallVector<VectorType> newDpasOperandExpectedTypes;
-
- // Resolve the distributed types with the original types.
- newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
- newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
- VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
- if (dpasOp.getAcc())
- newDpasOperandExpectedTypes.push_back(distributedResultTy);
-
- for (unsigned i = 0; i < newRetIndices.size(); i++) {
- newDpasOperands.push_back(
- resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
- newDpasOperandExpectedTypes[i], rewriter));
- }
- auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
- distributedResultTy, newDpasOperands,
- dpasOp->getAttrs());
- xegpu::removeLayoutAttrs(newDpasOp);
- Value distributedVal = newWarpOp.getResult(operandIdx);
- // Resolve the output type.
- Value typeResolved =
- resolveDistributedTy(newDpasOp.getResult(),
- distResultTypeByWarpOpOrFailure.value(), rewriter);
- rewriter.replaceAllUsesWith(distributedVal, typeResolved);
- return success();
- }
-};
-
-/// Distribute a prefetch_nd op at the end of enclosing
-/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
-/// through the warp op interface they would be propagated as returned values.
-/// Tensor descriptor shape is not distributed because it is a uniform value
-/// across all work items within the subgroup. Appropriate cast ops are inserted
-/// if the distributed types do not match the expected xegpu SIMT types.
-///
-/// Example:
-///
-/// ```
-/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-/// gpu.warp_execute_on_lane_0(%laneid) -> () {
-/// ...
-/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
-/// }
-/// ```
-/// To
-/// ```
-/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
-/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
-/// index
-/// }
-/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
-/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
-/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
-///
-/// ```
-struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- gpu::YieldOp yield = warpOp.getTerminator();
- Operation *lastNode = yield->getPrevNode();
- auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
- if (!prefetchOp)
- return failure();
-
- SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
- // PrefetchNdOp must have offsets.
- if (offsets.empty())
- return rewriter.notifyMatchFailure(prefetchOp,
- "the prefetch op must have offsets");
- SmallVector<Value> offsetsAsValues =
- vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
- SmallVector<Type> offsetTypes = llvm::map_to_vector(
- offsetsAsValues, [](Value v) { return v.getType(); });
-
- xegpu::DistributeLayoutAttr layout =
- prefetchOp.getTensorDescType().getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- prefetchOp, "the source tensor descriptor lacks layout attribute");
-
- SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
- SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
- newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
- newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
- // Create a new prefetch op outside the warp op with updated tensor
- // descriptor type. Source tensor descriptor require type resolution.
- xegpu::TensorDescType newTensorDescTy =
- prefetchOp.getTensorDescType().dropLayouts();
- rewriter.setInsertionPointAfter(newWarpOp);
- SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
- // Collect offsets.
- for (size_t i = 1; i < newRetIndices.size(); ++i)
- newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
- Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
- rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
- prefetchOp->getAttrs());
- xegpu::removeLayoutAttrs(newPrefetchOp);
- rewriter.eraseOp(prefetchOp);
- return success();
- }
-};
-
-/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
-/// region. This will simply move the barrier op outside of the warp op.
-struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- gpu::YieldOp yield = warpOp.getTerminator();
- Operation *lastNode = yield->getPrevNode();
- // The last node must be a gpu::BarrierOp.
- auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
- if (!barrierOp)
- return failure();
- // Move the barrier op outside of the warp op.
- rewriter.setInsertionPointAfter(warpOp);
- gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
- barrierOp->getResultTypes(),
- barrierOp->getOperands(), barrierOp->getAttrs());
- rewriter.eraseOp(barrierOp);
- return success();
- }
-};
-
-/// Distribute a scattered store op. The offsets argument is required.
-/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
-/// The layouts are fixed and implicit: one offset/mask per lane.
-/// The pass changes the offset/mask vector shapes to a
-/// single-element vector, **it is assumed that their producer will also be
-/// distributed**. The payload vector also has a fixed distribution:
-/// no chunk size -> vector of one element.
-/// chunk size -> vector of the innermost dimension of the SG-payload.
-/// Example 1 (no chunk size):
-/// %mask = producer_op : vector<16xi1>
-/// %offset = producer_op : vector<16xindex>
-/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
-/// memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-/// %mask = producer_op : vector<1xi1>
-/// %offset = producer_op : vector<1xindex>
-/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
-/// memref<256xf16>, vector<1xindex>, vector<1xi1>
-/// Example 2 (chunk size, same mask and offsets):
-/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-/// To
-/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
-/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-///
-/// Note that the store distribution pattern also handles leading unit
-/// dimensions in the payload, mask and offsets vectors. In this case the store
-/// distribution will only change the dimensions corresponding to the SG
-/// distribution and keep the leading unit dimensions unchanged.
-/// For example, a store with payload vector<1x16xf16> and lane layout [1, 16]
-/// will be distributed as vector<1x1xf16>. Shape-cast ops are inserted for the
-/// offset/mask/payload when necessary so that the distributed store operates
-/// on 1D vectors to match the HW capability.
-struct StoreDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- Operation *lastNode = warpOp.getTerminator()->getPrevNode();
- auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
- if (!storeScatterOp)
- return failure();
- Value offsets = storeScatterOp.getOffsets();
- if (!isa<VectorType>(offsets.getType()))
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Store op must have a vector of offsets argument");
- VectorType offsetsTy = cast<VectorType>(offsets.getType());
- VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
- VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
-
- // Add handling for leading unit dimensions support
- int chunkSize = storeScatterOp.getChunkSize().value_or(1);
- int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
-
- // Check that all leading dimensions are unit dimensions
- for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
- if (storeVecTy.getShape()[i] != 1) {
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Only unit dimensions allowed for the leading "
- "dimensions of the store vector!");
- }
- }
-
- auto layoutPayload = storeScatterOp.getLayoutAttr();
- auto layoutOffsets =
- xegpu::inferMaskOffsetLayoutForScatterIO(layoutPayload, chunkSize);
- auto layoutMask = layoutOffsets;
-
- FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
- FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
- FailureOr<VectorType> distMaskByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
- if (failed(distStoreVecByWarpOpOrFailure) ||
- failed(distOffsetsByWarpOpOrFailure) ||
- failed(distMaskByWarpOpOrFailure)) {
- return rewriter.notifyMatchFailure(
- storeScatterOp,
- "Some vector operands have no layouts, using defaults instead.");
- }
-
- VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
- VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
- VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> operands = storeScatterOp->getOperands();
- SmallVector<Type> operandTypesToYield = {
- distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};
-
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-
- rewriter.setInsertionPointAfter(newWarpOp);
-
- // Distributed store payload type is always 1D without leading unit dims
- VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
- distPayloadTy.getElementType());
-
- VectorType distOffsetsTy1D = VectorType::get(
- {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
- VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
- distMaskTy.getElementType());
-
- // Resolve distributed types to 1D for SIMT execution
- Value distPayloadVal = resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
- Value distOffsetVal = resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
- Value distMaskVal = resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);
-
- SmallVector<Value> newStoreScatterOpOperands = {
- distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
- distMaskVal};
-
- xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
- rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
- storeScatterOp->getAttrs());
- xegpu::removeLayoutAttrs(newOp);
- rewriter.eraseOp(storeScatterOp);
- return success();
- }
-};
-
-static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
- PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
- Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
- SmallVector<Value> newCoods;
- auto maybeCoords =
- layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
- if (failed(maybeCoords))
- return {};
- assert(maybeCoords.value().size() == 1 &&
- "Expected one set of distributed offsets");
- SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
- rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
- getAsOpFoldResult(origOffsets));
- newCoods = llvm::map_to_vector(ofrVec, llvm::CastTo<Value>);
- return newCoods;
-}
-
-/// Pattern for distributing xegpu::LoadMatrixOp.
-struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- gpu::YieldOp yield = warpOp.getTerminator();
- Operation *lastNode = yield->getPrevNode();
- auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
- if (!matrixOp)
- return failure();
-
- OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
- return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
- });
- if (!producedByLastLoad)
- return rewriter.notifyMatchFailure(
- warpOp, "The last op is not xegpu::LoadMatrixOp");
- const int operandIdx = producedByLastLoad->getOperandNumber();
-
- VectorType sgPayloadTy =
- dyn_cast<VectorType>(matrixOp.getResult().getType());
- VectorType warpResultTy =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
- if (!sgPayloadTy)
- return rewriter.notifyMatchFailure(
- matrixOp, "the matrix op payload must be a vector type");
-
- auto loc = matrixOp.getLoc();
- auto offsets = matrixOp.getMixedOffsets();
- if (offsets.empty())
- return rewriter.notifyMatchFailure(matrixOp,
- "the load op must have offsets");
- SmallVector<Value> offsetsAsValues =
- vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
-
- auto layout = matrixOp.getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- matrixOp, "the matrix operation lacks layout attribute");
-
- FailureOr<VectorType> distPayloadByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
- if (failed(distPayloadByWarpOpOrFailure))
- return rewriter.notifyMatchFailure(
- matrixOp, "Failed to distribute matrix op payload based on layout.");
-
- SmallVector<Value> operands = {matrixOp.getMemDesc()};
- const unsigned offsetsStartIdx = operands.size();
- operands.append(offsetsAsValues);
-
- SmallVector<Type> operandTypes =
- llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
-
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypes, newRetIndices);
- SmallVector<Value> newOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
- SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
- ShapedType::kDynamic);
- DenseI64ArrayAttr newConstOffsetsAttr =
- rewriter.getDenseI64ArrayAttr(newConstOffsets);
- ValueRange currentOffsets =
- ValueRange(newOperands).drop_front(offsetsStartIdx);
-
- SmallVector<Value> newCoords = currentOffsets;
- rewriter.setInsertionPointAfter(newWarpOp);
-
- if (!matrixOp.getSubgroupBlockIoAttr()) {
- newCoords = computeDistributedCoordinatesForMatrixOp(
- rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
- currentOffsets);
- }
- xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
- rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
- newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
- matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
- // Resolve the output type and replace all uses.
- rewriter.replaceAllUsesWith(
- newWarpOp.getResult(operandIdx),
- resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
- return success();
- }
-};
-
-/// Pattern for distributing xegpu::StoreMatrixOp.
-struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- gpu::YieldOp yield = warpOp.getTerminator();
- Operation *lastNode = yield->getPrevNode();
- auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
- if (!matrixOp)
- return failure();
-
- VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
- if (!sgPayloadTy)
- return rewriter.notifyMatchFailure(
- matrixOp, "the matrix op payload must be a vector type");
-
- auto loc = matrixOp.getLoc();
- auto offsets = matrixOp.getMixedOffsets();
- if (offsets.empty())
- return rewriter.notifyMatchFailure(matrixOp,
- "the store op must have offsets");
- SmallVector<Value> offsetsAsValues =
- vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
-
- auto layout = matrixOp.getLayoutAttr();
- if (!layout)
- return rewriter.notifyMatchFailure(
- matrixOp, "the matrix operation lacks layout attribute");
-
- FailureOr<VectorType> distPayloadByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
- if (failed(distPayloadByWarpOpOrFailure))
- return rewriter.notifyMatchFailure(
- matrixOp, "Failed to distribute matrix op payload based on layout.");
-
- SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
- const unsigned offsetsStartIdx = operands.size();
- operands.append(offsetsAsValues);
-
- SmallVector<Type> operandTypes =
- llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
- operandTypes[0] = *distPayloadByWarpOpOrFailure;
-
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypes, newRetIndices);
- SmallVector<Value> newOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
-
- SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
- ShapedType::kDynamic);
- DenseI64ArrayAttr newConstOffsetsAttr =
- rewriter.getDenseI64ArrayAttr(newConstOffsets);
- ValueRange currentOffsets =
- ValueRange(newOperands).drop_front(offsetsStartIdx);
-
- SmallVector<Value> newCoords = currentOffsets;
- rewriter.setInsertionPointAfter(newWarpOp);
-
- if (!matrixOp.getSubgroupBlockIoAttr()) {
- newCoords = computeDistributedCoordinatesForMatrixOp(
- rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
- currentOffsets);
- }
-
- xegpu::StoreMatrixOp::create(
- rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
- ValueRange(newCoords), newConstOffsetsAttr,
- matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
- rewriter.eraseOp(matrixOp);
- return success();
- }
-};
-
-/// Distribute a scattered load op. The logic and requirements are the same as
-/// for the scattered store distribution. The warpOp's payload vector is
-/// expected to be distributed by the load's result consumer.
-/// Example 1 (no chunk size):
-/// %mask = producer_op : vector<16xi1>
-/// %offset = producer_op : vector<16xindex>
-/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
-/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
-/// To
-/// %mask = producer_op : vector<1xi1>
-/// %offset = producer_op : vector<1xindex>
-/// %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
-/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
-/// Example 2 (chunk size, same mask and offsets):
-/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
-/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
-/// To
-/// %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
-/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-///
-/// Note that the load distribution pattern also handles leading unit dimensions
-/// in the payload, mask, and offsets vectors. The load distribution will only
-/// change the dimensions corresponding to the SG distribution and keep the
-/// leading unit dimensions unchanged. For example, a load with result type
-/// vector<1x16xf16> and lane layout [1, 16] will be distributed
-/// as result type vector<1x1xf16>. Shape-cast ops are inserted for the
-/// offset/mask/payload when necessary so that the distributed load operates
-/// on 1D vectors to match the HW capability.
-struct LoadDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
- // Check if the yield operand that was produced by the *last* scattered
- // load op to avoid sinking it before barriers (maintain memory order).
- return isa<xegpu::LoadGatherOp>(op) &&
- warpOp.getTerminator()->getPrevNode() == op;
- });
- if (!producedByLastLoad)
- return rewriter.notifyMatchFailure(
- warpOp, "The last op is not xegpu::LoadGatherOp");
-
- auto loadGatherOp =
- producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
- Value offsets = loadGatherOp.getOffsets();
- if (!isa<VectorType>(offsets.getType()) ||
- !isa<VectorType>(loadGatherOp.getMask().getType()))
- return rewriter.notifyMatchFailure(
- loadGatherOp,
- "Load op must have vector arguments for offsets and mask");
- VectorType offsetsTy = cast<VectorType>(offsets.getType());
- VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
- VectorType resultVecTy =
- cast<VectorType>(loadGatherOp.getResult().getType());
- // add handling leading unit dimensions support
- int chunkSize = loadGatherOp.getChunkSize().value_or(1);
- int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
- for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
- if (resultVecTy.getShape()[i] != 1) {
- return rewriter.notifyMatchFailure(
- loadGatherOp, "Only unit dimensions allowed for the leading "
- "dimensions of the load vector!");
- }
- }
-
- auto layoutPayload = loadGatherOp.getLayoutAttr();
- auto layoutOffsets =
- xegpu::inferMaskOffsetLayoutForScatterIO(layoutPayload, chunkSize);
- auto layoutMask = layoutOffsets;
-
- FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
- FailureOr<VectorType> distMaskByWarpOpOrFailure =
- getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
- if (failed(distOffsetsByWarpOpOrFailure) ||
- failed(distMaskByWarpOpOrFailure)) {
- return rewriter.notifyMatchFailure(
- loadGatherOp,
- "Some vector operands have no layouts, using defaults instead.");
- }
-
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> operands = loadGatherOp->getOperands();
-
- const unsigned operandIdx = producedByLastLoad->getOperandNumber();
- VectorType distResultTy =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
- VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
- VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
-
- SmallVector<Type> operandTypesToYield = {operands[0].getType(),
- distOffsetsTy, distMaskTy};
-
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
-
- rewriter.setInsertionPointAfter(newWarpOp);
-
- // Distributed load op will always be 1D.
- VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
- distResultTy.getElementType());
-
- VectorType distOffsetsTy1D =
- VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
- distOffsetsByWarpOpOrFailure.value().getElementType());
- VectorType distMaskTy1D =
- VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
- distMaskByWarpOpOrFailure.value().getElementType());
-
- Value distOffsetVal = resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
- Value distmaskVal = resolveDistributedTy(
- newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
-
- SmallVector<Value> newLoadGatherOperands = {
- newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
-
- xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
- rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
- loadGatherOp->getAttrs());
- xegpu::removeLayoutAttrs(newOp);
- Value distributedVal = newWarpOp.getResult(operandIdx);
- // Resolve the output type and replace all uses.
- rewriter.replaceAllUsesWith(
- distributedVal,
- resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
- return success();
- }
-};
-
-// Sink SG-uniform ops. An op is uniform if none
-// of its operands/results has a distribution layout attribute.
-// Non-uniform vectors are handled by dedicated patterns.
-// This pattern must have a higher priority than vector dialect distribution
-// patterns, because a distributable shape may be logically intended as
-// uniform (i.e., no layout), so we want to omit its distribution.
-struct SinkUniformOps final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- // Take the last op
- Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
- // Any ops with nested regions must be handled carefully in dedicated
- // patterns.
- if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
- return failure();
- int operandIdx = -1;
- if (warpRegionPreYieldOp->getNumResults()) {
- OpOperand *operand = getWarpResult(
- warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
- if (!operand)
- return failure();
- operandIdx = operand->getOperandNumber();
- if (warpRegionPreYieldOp->getResult(0).getType() !=
- warpOp.getResult(operandIdx).getType())
- return rewriter.notifyMatchFailure(warpOp,
- "The op result is not uniform.");
- }
-
- // The op must have no layout-based operands or results.
- bool uniformValuesOnly =
- llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
- return !xegpu::getDistributeLayoutAttr(v);
- });
- uniformValuesOnly &=
- llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
- return !xegpu::getDistributeLayoutAttr(opr);
- });
- if (!uniformValuesOnly)
- return rewriter.notifyMatchFailure(warpOp,
- "Some values are not uniform.");
- SmallVector<size_t> newRetIndices;
- SmallVector<Value> operands =
- llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
- SmallVector<Type> operandTypes =
- llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, operands, operandTypes, newRetIndices);
-
- rewriter.setInsertionPointAfter(newWarpOp);
- IRMapping operandMapper;
- for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
- operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
- newWarpOp->getResult(newOperandIdx));
- Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
- if (!clonedOp->getNumResults())
- rewriter.eraseOp(warpRegionPreYieldOp);
- else {
- assert(operandIdx != -1 && "Expected a warp result for the operation");
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
- clonedOp->getResult(0));
- }
- return success();
- }
-};
-
-/// This pattern distributes the `vector.multi_reduction` operation across
-/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
-/// layouts for the source and accumulator vectors,
-/// * If the reduction dimension is distributed across lanes, the reduction is
-/// non-lane-local and the reduction is done using warp shuffles. Here we
-/// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
-/// the warp op body.
-/// * If the reduction dimension is not distributed across lanes, the reduction
-/// is lane-local. In this case, we yield the source and accumulator vectors
-/// from the warp op and perform the lane-local reduction outside the warp op
-/// using a sequence of ReductionOps.
-/// Example 1 (Reduction is lane-local):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
-/// %0 = "some_def"() : () -> (vector<16x32xf32>)
-/// %acc = "some_def"() : () -> (vector<32xf32>)
-/// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
-/// vector<32xf32>
-/// gpu.yield %1 : vector<32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
-/// vector<1xf32>) {
-/// %0 = "some_def"() : () -> (vector<16x32xf32>)
-/// %acc = "some_def"() : () -> (vector<32xf32>)
-/// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
-/// }
-/// %c = arith.constant dense<0.0> : vector<1xf32>
-/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
-/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
-/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
-/// ```
-/// Example 2 (Reduction is non-lane-local):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
-/// %0 = "some_def"() : () -> (vector<2x32xf32>)
-/// %acc = "some_def"() : () -> (vector<2xf32>)
-/// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
-/// vector<2xf32>
-/// gpu.yield %1 : vector<2xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
-/// %0 = "some_def"() : () -> (vector<2x32xf32>)
-/// %acc = "some_def"() : () -> (vector<2xf32>)
-/// %1 = arith.constant dense<0.0> : vector<2xf32>
-/// %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>>
-/// %3 = ("warp.reduction %2") : f32
-/// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
-/// ... repeat for row 1
-/// gpu.yield %1 : vector<2xf32>
-/// }
-/// ```
-struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *yieldOperand =
- getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
- if (!yieldOperand)
- return failure();
- auto reductionOp =
- cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
- unsigned operandIdx = yieldOperand->getOperandNumber();
- VectorType sourceType = reductionOp.getSourceVectorType();
- int64_t sourceRank = sourceType.getRank();
- // Need at least a 2D source vector.
- if (sourceRank < 2)
- return rewriter.notifyMatchFailure(warpOp,
- "Only 2D+ reductions are supported.");
- // Leading dimensions (first rank-2) must be unit (size 1).
- for (int64_t i = 0; i < sourceRank - 2; ++i) {
- if (sourceType.getShape()[i] != 1)
- return rewriter.notifyMatchFailure(
- warpOp, "Only unit dimensions allowed for the leading dimensions.");
- }
- // Effective dimension indices (last 2 dims of the source).
- int64_t rowIdx = sourceRank - 2;
- int64_t columnIdx = sourceRank - 1;
- ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
- if (reductionDims.size() != 1)
- return rewriter.notifyMatchFailure(warpOp,
- "Only 1 reduction dim is supported.");
- int64_t reductionDim = reductionDims[0];
- // The reduction dim must be among the last 2 dims.
- if (reductionDim != rowIdx && reductionDim != columnIdx)
- return rewriter.notifyMatchFailure(
- warpOp, "Reduction dim must be among the last 2 dimensions.");
- VectorType distributedResultType =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
- VectorType resultType = cast<VectorType>(reductionOp.getType());
- xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
-
- FailureOr<VectorType> sourceDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
- if (failed(sourceDistTypeOrFailure))
- return rewriter.notifyMatchFailure(
- warpOp, "Failed to distribute the source vector type.");
- VectorType sourceDistType = sourceDistTypeOrFailure.value();
- // Only single dimension distribution among the last 2 dims is supported.
- bool rowDistributed =
- sourceDistType.getShape()[rowIdx] != sourceType.getShape()[rowIdx];
- bool columnDistributed = sourceDistType.getShape()[columnIdx] !=
- sourceType.getShape()[columnIdx];
- if (rowDistributed && columnDistributed)
- return rewriter.notifyMatchFailure(
- warpOp, "Expecting source to be distributed in a single dimension.");
- int64_t sourceDistDim =
- rowDistributed ? rowIdx : (columnDistributed ? columnIdx : -1);
- if (sourceDistDim == -1)
- return rewriter.notifyMatchFailure(
- warpOp, "Expecting a distributed source vector.");
- bool resultDistributed =
- distributedResultType.getNumElements() < resultType.getNumElements();
- // If the lane owns all the data required for reduction (i.e. reduction is
- // fully parallel across lanes), then each lane owns part of the result
- // (i.e. result is distributed). If the reduction requires cross-lane
- // shuffling, then the result is shared among all lanes (broadcasted).
- // Therefore we expect following cases:
- //
- // | Source vector | Reduction dim | Result vector |
- // |----------------------|----------------|----------------|
- // | dim-0 distributed | 0 | broadcasted |
- // | dim-0 distributed | 1 | distributed |
- // | dim-1 distributed | 0 | distributed |
- // | dim-1 distributed | 1 | broadcasted |
-
- bool isReductionLaneLocal =
- (sourceDistDim == rowIdx && reductionDim == columnIdx) ||
- (sourceDistDim == columnIdx && reductionDim == rowIdx);
- if (isReductionLaneLocal && !resultDistributed)
- return rewriter.notifyMatchFailure(
- warpOp, "Expecting a distributed result for lane-local reduction.");
-
- if (!isReductionLaneLocal && resultDistributed)
- return rewriter.notifyMatchFailure(
- warpOp,
- "Expecting a broadcasted result for non-lane-local reduction.");
-
- // Handle lane-local reduction case. In this case we fully distribute the
- // reduction result.
- if (isReductionLaneLocal) {
- // Yield the source and acc vectors from the WarpOp.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
- {sourceDistType, distributedResultType}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
- cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
- reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
- // Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
- return success();
- }
- // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
- // of multiple ReductionOps. Actual distribution is done by the
- // WarpOpReduction pattern.
- rewriter.setInsertionPointAfter(reductionOp);
- Value result = xegpu::lowerToVectorReductions(
- cast<TypedValue<VectorType>>(reductionOp.getSource()),
- cast<TypedValue<VectorType>>(reductionOp.getAcc()),
- reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
- // Replace the warp op result with the final result.
- rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
- return success();
- }
-};
-
-/// This pattern distributes the `vector.broadcast` operation across lanes in a
-/// warp. The pattern supports three use cases:
-///
-/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
-/// vector
-/// must have a slice layout of the result. If the distributed source and
-/// target vector types are identical, this lowers to a no-op; otherwise, it
-/// remains a broadcast but operates on distributed vectors.
-///
-/// 2) Broadcast a same-rank vector with identical layouts for source and
-/// target:
-/// The source vector must have unit dimensions, and lane_data must be unit
-/// size for those unit dims. This always lowers to a no-op.
-///
-/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
-/// scalar to distributed result type.
-///
-/// Example 1 (lowering to a broadcast with distributed types):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
-/// %0 = "some_def"() {layout_result_0 =
-/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-/// dims = [0]> } : () -> (vector<32xf32>)
-/// %2 = vector.broadcast %0 {layout_result_0 =
-/// #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
-/// : vector<32xf32> to vector<8x32xf32>
-/// gpu.yield %2 : vector<8x32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
-/// %0 = "some_def"() {layout_result_0 =
-/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-/// dims = [0]> } : () -> (vector<32xf32>)
-/// gpu.yield %0 : vector<32xf32>
-/// }
-/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
-/// ```
-/// Example 2 (no-op):
-/// ```
-/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
-/// %0 = "some_def"() {layout_result_0 =
-/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-/// dims = [1]> } : () -> (vector<8xf32>)
-/// %1 = vector.shape_cast %0
-/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-/// 1]>}: vector<8xf32> to vector<8x1xf32>
-/// %2 = vector.broadcast %1
-/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-/// 1]>}: vector<8x1xf32> to vector<8x32xf32>
-/// gpu.yield %2 : vector<8x32xf32>
-/// }
-/// ```
-/// is lowered to:
-/// ```
-/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
-/// %0 = "some_def"() {layout_result_0 =
-/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
-/// dims = [1]> } : () -> (vector<8xf32>)
-/// %1 = vector.shape_cast %0
-/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
-/// 1]>}: vector<8xf32> to vector<8x1xf32>
-/// gpu.yield %1 : vector<8x1xf32>
-/// }
-/// // The broadcast is implicit through layout transformation (no-op)
-/// "some_use"(%r#0)
-/// ```
-struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *yieldOperand =
- getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
- if (!yieldOperand)
- return failure();
- auto broadcastOp =
- cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
- unsigned operandIdx = yieldOperand->getOperandNumber();
-
- VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
- VectorType destType =
- dyn_cast<VectorType>(broadcastOp.getResult().getType());
-
- xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
- xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));
-
- FailureOr<VectorType> sourceDistType;
- Type sourceElemOrDistType;
- if (sourceType) {
-
- // Case 1 and 2: source is a vector type.
- int64_t rankDiff = destType.getRank() - sourceType.getRank();
- if (rankDiff > 0) {
- // Case 1: source is lower-rank than result.
- bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
- if (!isSliceOf)
- broadcastOp.emitWarning()
- << "Broadcast input layout must be a slice of result layout.";
- }
- // case 2: source and result have same rank
- if (rankDiff == 0) {
- auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
- SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
- broadcastUnitDimsSet.end());
- assert(sourceLayout.isEqualTo(
- sourceLayout.setUnitDimData(broadcastUnitDims)) &&
- "The sg_data for unit dimensions should be set as 1");
- sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
- }
-
- sourceDistType =
- getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
- if (failed(sourceDistType)) {
- return rewriter.notifyMatchFailure(
- warpOp, "Failed to distribute the source vector type.");
- }
- sourceElemOrDistType = sourceDistType.value();
-
- } else {
- // Case 3: source is a scalar type.
- if (sourceLayout) {
- return rewriter.notifyMatchFailure(
- warpOp, "Broadcast from scalar must not have a layout attribute.");
- }
- sourceElemOrDistType = broadcastOp.getSourceType();
- }
- FailureOr<VectorType> destDistType =
- getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
- if (failed(destDistType)) {
- return rewriter.notifyMatchFailure(
- warpOp, "Failed to distribute the dest vector type.");
- }
-
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
- newRetIndices);
-
- Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
-
- Value newBroadcast = distributedSource;
-
- if (sourceElemOrDistType != destDistType.value()) {
- rewriter.setInsertionPointAfter(newWarpOp);
- newBroadcast =
- vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
- destDistType.value(), distributedSource);
- }
-
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
- return success();
- }
-};
-
-/// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region.
-struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *yieldOperand =
- getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
- if (!yieldOperand)
- return failure();
- auto shapeCastOp =
- cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
- unsigned operandNumber = yieldOperand->getOperandNumber();
- auto resultDistTy =
- cast<VectorType>(warpOp.getResult(operandNumber).getType());
- xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0));
- xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getTemporaryLayout(dyn_cast<OpResult>(shapeCastOp.getResult()));
- if (!sourceLayout || !resultLayout)
- return rewriter.notifyMatchFailure(
- warpOp,
- "the source or result of shape_cast op lacks distribution layout");
-
- FailureOr<VectorType> sourceDistTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(sourceLayout,
- shapeCastOp.getSourceVectorType());
- if (failed(sourceDistTypeOrFailure))
- return rewriter.notifyMatchFailure(
- warpOp, "failed to get distributed vector type for source");
- VectorType sourceDistType = sourceDistTypeOrFailure.value();
- // Create a new warp op that yields the source of the shape_cast op.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
- newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- Value source = newWarpOp.getResult(newRetIndices[0]);
- // Create a new shape_cast op outside the warp op.
- Value newShapeCast = vector::ShapeCastOp::create(
- rewriter, shapeCastOp.getLoc(), resultDistTy, source);
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
- newShapeCast);
- return success();
- }
-};
-
-// Distribute a `vector.extract_strided_slice` op feeding into yield op of an
-// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
-// advanced cases where the distributed dimension is partially extracted and
-// currently not supported by the generic vector distribution patterns.
-struct VectorExtractStridedSliceDistribution
- : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand =
- getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
- if (!operand)
- return failure();
- auto extractOp =
- cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
- unsigned operandIdx = operand->getOperandNumber();
- auto distributedType =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
- // Find the distributed dimensions.
- auto extractResultType = cast<VectorType>(operand->get().getType());
- auto distributedDims =
- getDistributedDims(extractResultType, distributedType);
- // Collect updated source type, sizes and offsets. They may be adjusted
- // later if the data is distributed to lanes (as opposed to being owned by
- // all lanes uniformly).
- VectorType updatedSourceType = extractOp.getSourceVectorType();
- SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
- extractOp.getSizes(), [](Attribute attr) { return attr; });
- SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
- extractOp.getOffsets(), [](Attribute attr) { return attr; });
- SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
- extractOp.getStrides(), [](Attribute attr) { return attr; });
- // If the provided sizes, offsets, strides are less than the rank, pad them
- // with full sizes, zero offsets, and unit strides. This makes it easier to
- // adjust them later.
- int64_t sourceRank = extractOp.getSourceVectorType().getRank();
- for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
- updatedSizes.push_back(rewriter.getI64IntegerAttr(
- extractOp.getSourceVectorType().getDimSize(i)));
- updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
- updatedStrides.push_back(
- rewriter.getI64IntegerAttr(1)); // stride is always 1.
- }
- // If the result is distributed, it must be distributed in exactly one
- // dimension. In this case, we adjust the sourceDistType, distributedSizes
- // and distributedOffsets accordingly.
- if (distributedDims.size() > 0) {
- if (distributedDims.size() != 1)
- return rewriter.notifyMatchFailure(
- warpOp, "Source can not be distributed in multiple dimensions.");
- int64_t distributedDim = distributedDims[0];
- int sourceDistrDimSize =
- extractOp.getSourceVectorType().getShape()[distributedDim];
- auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
- if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
- return rewriter.notifyMatchFailure(
- warpOp, "the source of extract_strided_slice op lacks distribution "
- "layout");
- auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
- // Because only single dimension distribution is supported, lane layout
- // size at the distributed dim must be the subgroup size.
- int subgroupSize = sourceLaneLayout[distributedDim];
- // Check if the source size in the distributed dimension is a multiple of
- // subgroup size.
- if (sourceDistrDimSize % subgroupSize != 0)
- return rewriter.notifyMatchFailure(
- warpOp,
- "Source size along distributed dimension is not a multiple of "
- "subgroup size.");
- auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
- // We expect lane data to be all ones in this case.
- if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
- return rewriter.notifyMatchFailure(
- warpOp, "Expecting unit lane data in source layout");
- // The offsets in the distributed dimension must be a multiple of subgroup
- // size.
- int64_t distrDimOffset =
- cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
- if (distrDimOffset % subgroupSize != 0)
- return rewriter.notifyMatchFailure(
- warpOp, "Offset along distributed dimension "
- "is not a multiple of subgroup size.");
- updatedSourceType = getDistVecTypeBasedOnLaneLayout(
- sourceLayout, extractOp.getSourceVectorType())
- .value();
- // Update the distributed sizes to match the distributed type.
- updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
- distributedType.getDimSize(distributedDim));
- // Update the distributed offsets to match round robin distribution (i.e.
- // each lane owns data at `subgroupSize` stride given unit lane data).
- updatedOffsets[distributedDim] =
- rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
- }
- // Do the distribution by yielding the source of the extract op from
- // the warp op and creating a new extract op outside the warp op.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
- newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- Value source = newWarpOp.getResult(newRetIndices[0]);
- // Create a new extract op outside the warp op.
- Value newExtractOp = vector::ExtractStridedSliceOp::create(
- rewriter, extractOp.getLoc(), distributedType, source,
- ArrayAttr::get(rewriter.getContext(), updatedOffsets),
- ArrayAttr::get(rewriter.getContext(), updatedSizes),
- ArrayAttr::get(rewriter.getContext(), updatedStrides));
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
- return success();
- }
-};
-
-/// Distribute a `vector.insert_strided_slice` op feeding into yield op of an
-/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
-/// advanced cases where the distributed dimension is partially inserted and
-/// currently not supported by the generic vector distribution patterns.
-struct VectorInsertStridedSliceDistribution
- : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
- // Check if the InsertStridedSliceOp is the last op before yield op
- return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
- warpOp.getTerminator()->getPrevNode() == op;
- });
- if (!operand)
- return failure();
- unsigned int operandNumber = operand->getOperandNumber();
- auto insertOp =
- operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
- auto distributedType =
- cast<VectorType>(warpOp.getResult(operandNumber).getType());
- // Find the distributed dimensions of the dest vector.
- auto insertResultType = cast<VectorType>(operand->get().getType());
- auto destDistributedDims =
- getDistributedDims(insertResultType, distributedType);
- // Collect updated offsets, source type and dest type. They may be adjusted
- // later if the data is distributed to lanes (as opposed to being owned by
- // all lanes uniformly).
- SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
- insertOp.getOffsets(), [](Attribute attr) { return attr; });
- VectorType updatedSourceType = insertOp.getSourceVectorType();
- VectorType updatedDestType = insertOp.getDestVectorType();
- if (destDistributedDims.size() > 0) {
- // Only single dimension distribution is supported.
- if (destDistributedDims.size() != 1)
- return rewriter.notifyMatchFailure(
- warpOp,
- "Expecting source to be distributed in a single dimension.");
- int64_t destDistributedDim = destDistributedDims[0];
-
- VectorType srcType = insertOp.getSourceVectorType();
- VectorType destType = insertOp.getDestVectorType();
- // Currently we require that both source (kD) and dest (nD) vectors are
- // distributed. This requires that distributedDim (d) is contained in the
- // last k dims of the dest vector (d >= n - k).
- int64_t sourceDistributedDim =
- destDistributedDim - (destType.getRank() - srcType.getRank());
- if (sourceDistributedDim < 0)
- return rewriter.notifyMatchFailure(
- insertOp,
- "distributed dimension must be in the last k (i.e. source "
- "rank) dims of dest vector");
- int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
- // Obtain the source and dest layouts.
- auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
- auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
- if (!destLayout || !sourceLayout ||
- destLayout.getEffectiveLaneLayoutAsInt().empty() ||
- sourceLayout.getEffectiveLaneLayoutAsInt().empty())
- return rewriter.notifyMatchFailure(
- warpOp, "the source or dest of insert_strided_slice op lacks "
- "distribution layout");
- // Because only single dimension distribution is supported, lane layout
- // size at the distributed dim must be the subgroup size.
- int subgroupSize =
- destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
- // We require that source and dest lane data are all ones to ensure
- // uniform round robin distribution.
- auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
- auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
- if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
- !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
- return rewriter.notifyMatchFailure(
- warpOp, "Expecting unit lane data in source and dest layouts");
- // Source distributed dim size must be multiples of subgroup size.
- if (srcDistrDimSize % subgroupSize != 0)
- return rewriter.notifyMatchFailure(
- warpOp, "Distributed dimension size in source is not a multiple of "
- "subgroup size.");
- // Offsets in the distributed dimension must be multiples of subgroup
- // size.
- int64_t destDistrDimOffset =
- cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
- if (destDistrDimOffset % subgroupSize != 0)
- return rewriter.notifyMatchFailure(
- warpOp,
- "Offset along distributed dimension in dest is not a multiple of "
- "subgroup size.");
- // Update the source and dest types based on their layouts.
- updatedSourceType = getDistVecTypeBasedOnLaneLayout(
- sourceLayout, insertOp.getSourceVectorType())
- .value();
- updatedDestType = getDistVecTypeBasedOnLaneLayout(
- destLayout, insertOp.getDestVectorType())
- .value();
- // Update the distributed offsets to match round robin distribution (i.e.
- // each lane owns data at `subgroupSize` stride given unit lane data).
- updatedOffsets[destDistributedDim] =
- rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
- }
- // Do the distribution by yielding the source and dest of the insert op
- // from the warp op and creating a new insert op outside the warp op.
- SmallVector<size_t> newRetIndices;
- auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
- {updatedSourceType, updatedDestType}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
-
- Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
- Value dest = newWarpOp.getResult(newRetIndices[1]);
- // Create a new insert op outside the warp op.
- Value newInsertOp = vector::InsertStridedSliceOp::create(
- rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
- ArrayAttr::get(rewriter.getContext(), updatedOffsets),
- insertOp.getStrides());
- rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
- newInsertOp);
- return success();
- }
-};
-
-/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an
-/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op
-/// outside of the warp op.
-struct MemrefExtractAlignedPointerAsIndexDistribution final
- : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand = getWarpResult(
- warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp,
- "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");
- auto extractOp =
- operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
- unsigned operandIdx = operand->getOperandNumber();
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, extractOp.getSource(),
- TypeRange{extractOp.getSource().getType()}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create(
- rewriter, newWarpOp.getLoc(), extractOp.getType(),
- newWarpOp.getResult(newRetIndices[0]));
- Value resultVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(resultVal, newExtractOp.getResult());
- return success();
- }
-};
-
-/// Distribute a vector::BitCastOp feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost
-/// dimension of the source/result vectors. Equivalent vector::BitCastOp is
-/// created outside of the warp op with distributed source vector type (computed
-/// using assigned layout).
-struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand =
- getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a vector::BitCast op");
- auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>();
- unsigned operandIdx = operand->getOperandNumber();
- VectorType distributedSourceType =
- getDistVecTypeBasedOnLaneLayout(
- xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)),
- bitcastOp.getSourceVectorType())
- .value_or(VectorType());
- if (!distributedSourceType)
- return rewriter.notifyMatchFailure(
- bitcastOp, "Failed to distribute the source vector type in "
- "vector::BitCast op");
- VectorType distributedResultType =
- cast<VectorType>(warpOp.getResult(operandIdx).getType());
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, bitcastOp.getSource(),
- TypeRange{distributedSourceType}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- auto newBitcastOp = vector::BitCastOp::create(
- rewriter, newWarpOp.getLoc(), distributedResultType,
- newWarpOp.getResult(newRetIndices[0]));
- Value distributedVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult());
- return success();
- }
-};
-
-/// Distribute a vector::TransposeOp feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are
-/// supported. In most cases, transpose is a no op because it is entirely
-/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns
-/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local
-/// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent
-/// vector::TransposeOp outside of the warp op with distributed source vector
-/// type (computed using assigned layout).
-struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand =
- getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a vector::Transpose op");
- auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
- unsigned operandIdx = operand->getOperandNumber();
- xegpu::DistributeLayoutAttr sourceLayout =
- xegpu::getTemporaryLayout(transposeOp->getOpOperand(0));
- xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getTemporaryLayout(transposeOp->getOpResult(0));
- if (!sourceLayout || !resultLayout)
- return rewriter.notifyMatchFailure(
- transposeOp,
- "the source or result vector of the transpose op lacks layout "
- "attribute");
- int64_t sourceRank = transposeOp.getSourceVectorType().getRank();
- int64_t resultRank = transposeOp.getResultVectorType().getRank();
- // Only 2D transposes are supported for now.
- // TODO: Support nD transposes.
- if (sourceRank != 2 || resultRank != 2)
- return rewriter.notifyMatchFailure(
- transposeOp, "the source or result vector of the transpose op "
- "does not have 2D layout");
- ArrayRef<int64_t> perm = transposeOp.getPermutation();
- // Result layout must be a transpose of source layout.
- if (!resultLayout.isTransposeOf(sourceLayout, perm,
- xegpu::LayoutKind::Lane))
- return rewriter.notifyMatchFailure(
- transposeOp,
- "the source or result vector layouts must be 2D transposes of each "
- "other");
- FailureOr<VectorType> distributedSourceTypeOrFailure =
- getDistVecTypeBasedOnLaneLayout(sourceLayout,
- transposeOp.getSourceVectorType());
- if (failed(distributedSourceTypeOrFailure))
- return rewriter.notifyMatchFailure(
- transposeOp, "Failed to distribute the source vector type in "
- "vector::Transpose op");
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, transposeOp.getVector(),
- TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- auto newTransposeOp = vector::TransposeOp::create(
- rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]),
- perm);
- Value distributedVal = newWarpOp.getResult(operandIdx);
- rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult());
- return success();
- }
-};
-
-/// Distribute a vector::StepOp with the sliced result layout.
-/// The sliced layout must have exactly 1 effective lane dimension.
-/// We completely resolve the vector::StepOp by computing the lane_data-sized
-/// subranges.
-struct VectorStepSliceDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<vector::StepOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a vector::StepOp op");
- auto stepOp = operand->get().getDefiningOp<vector::StepOp>();
- unsigned operandIdx = operand->getOperandNumber();
- xegpu::DistributeLayoutAttr resultLayout =
- xegpu::getTemporaryLayout(stepOp->getResult(0));
- if (!resultLayout)
- return rewriter.notifyMatchFailure(
- stepOp, "the result vector of the step op lacks layout "
- "attribute");
- auto sliceLayout = dyn_cast<xegpu::SliceAttr>(resultLayout);
- if (!sliceLayout)
- return rewriter.notifyMatchFailure(
- stepOp, "the result layout must be a slice layout");
- if (sliceLayout.getEffectiveLaneLayoutAsInt().size() != 1)
- return rewriter.notifyMatchFailure(
- stepOp, "expecting 1 dim in the effective result layout");
-
- rewriter.setInsertionPointAfter(warpOp);
- auto loc = stepOp.getLoc();
- auto stepResultVecTy = stepOp.getResult().getType();
- Value distributedVal = warpOp.getResult(operandIdx);
- VectorType newVecTy = cast<VectorType>(distributedVal.getType());
-
- auto laneDataBlockCoords = resultLayout.computeDistributedCoords(
- rewriter, loc, warpOp.getLaneid(), stepResultVecTy.getShape());
- if (failed(laneDataBlockCoords))
- return rewriter.notifyMatchFailure(
- stepOp, "failed to compute lane data block coordinates");
-
- auto laneDataBlockCoordsVec = laneDataBlockCoords.value();
- auto laneDataBlockLength = resultLayout.getEffectiveLaneDataAsInt()[0];
- assert(static_cast<int64_t>(laneDataBlockCoordsVec.size()) ==
- newVecTy.getNumElements() / laneDataBlockLength);
- SmallVector<Value> stepVals;
- // For each lane_data block, reconstruct its sub-range
- // from the range of SG-level vector.step. Example: vector.step
- // {slice<layout<lane_layout=[2,4,2], lane_data=[1,2,1]>, dims=[0,2]>} :
- // vector<16xindex>
- // Each logical lane holds 4 elements as 2 blocks of 2 elements each.
- // The blocks are round-robin distributed, so logical lane id 0
- // holds values [0,1, 8,9].
- for (auto &laneDataBlockCoords : laneDataBlockCoordsVec) {
- auto laneDataBlockStartCoord = laneDataBlockCoords[0];
- stepVals.push_back(laneDataBlockStartCoord);
- for (int i = 1; i < laneDataBlockLength; ++i) {
- auto offset = arith::ConstantIndexOp::create(rewriter, loc, i);
- stepVals.push_back(arith::AddIOp::create(
- rewriter, loc, laneDataBlockStartCoord, offset));
- }
- }
- assert(static_cast<int64_t>(stepVals.size()) == newVecTy.getNumElements() &&
- "Expecting the number of step values to match the number of "
- "elements in the vector");
- auto stepOpVal =
- vector::FromElementsOp::create(rewriter, loc, newVecTy, stepVals);
- rewriter.replaceAllUsesWith(distributedVal, stepOpVal);
- return success();
- }
-};
-
-struct ConvertLayoutDistribution
- : public OpRewritePattern<xegpu::ConvertLayoutOp> {
- using OpRewritePattern::OpRewritePattern;
-
- LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
- PatternRewriter &rewriter) const override {
- auto inputLayout = op.getInputLayoutAttr();
- auto targetLayout = op.getTargetLayoutAttr();
- Type valType = op.getResult().getType();
-
- if (!inputLayout || !targetLayout)
- return rewriter.notifyMatchFailure(op, "missing layout attributes");
-
- if (valType.isIntOrFloat()) {
- rewriter.replaceOp(op, op.getSource());
- return success();
- }
- auto resShape = cast<VectorType>(valType).getShape();
- SmallVector<int64_t> resShapeVec(resShape.begin(), resShape.end());
- if (!inputLayout.isCompatibleWith(targetLayout, resShapeVec,
- xegpu::LayoutKind::Lane)) {
- return rewriter.notifyMatchFailure(
- op, "lowering incompatible convert_layout not yet supported");
- }
- rewriter.replaceOp(op, op.getSource());
- return success();
- }
-};
-
-} // namespace
-
-namespace {
-struct XeGPUSubgroupDistributePass final
- : public xegpu::impl::XeGPUSubgroupDistributeBase<
- XeGPUSubgroupDistributePass> {
- void runOnOperation() override;
-};
-} // namespace
-
-void xegpu::populateXeGPUSubgroupDistributePatterns(
- RewritePatternSet &patterns) {
- patterns.add<CreateNdDescDistribution, StoreNdDistribution,
- LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
- GpuBarrierDistribution, VectorMultiReductionDistribution,
- LoadDistribution, StoreDistribution, VectorTransposeDistribution,
- VectorBitcastDistribution, LoadMatrixDistribution,
- StoreMatrixDistribution, ConvertLayoutDistribution,
- MemrefExtractAlignedPointerAsIndexDistribution>(
- patterns.getContext(),
- /*pattern benefit=*/PatternHierarchy::Regular);
- // For following patterns, we need to override the regular vector distribution
- // patterns. Therefore, assign higher benefit.
- patterns
- .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
- VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
- VectorStepSliceDistribution, SinkUniformOps>(
- patterns.getContext(),
- /*pattern benefit=*/PatternHierarchy::AboveRegular);
-}
-
-void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
- RewritePatternSet &patterns) {
- patterns.add<MoveFuncBodyToWarpOp>(patterns.getContext());
-}
-
-void XeGPUSubgroupDistributePass::runOnOperation() {
- // Step 1: Attach layouts to op operands.
- // TODO: Following assumptions are made:
- // 1) It is assumed that there are no layout conflicts.
- // 2) Any existing layout attributes attached to the operands are ignored.
- Operation *op = getOperation();
- if (!xegpu::recoverTemporaryLayouts(op)) {
- signalPassFailure();
- return;
- }
-
- // Step 2: Move all operations of a GPU function inside
- // gpu.warp_execute_on_lane_0 operation.
- {
- RewritePatternSet patterns(&getContext());
- xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
-
- if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
- signalPassFailure();
- return;
- }
- // At this point, we have moved the entire function body inside the
- // warpOp. Now move any scalar uniform code outside of the warpOp (like
- // GPU index ops, scalar constants, etc.). This will simplify the
- // later lowering and avoid custom patterns for these ops.
- getOperation()->walk([&](Operation *op) {
- if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
- vector::moveScalarUniformCode(warpOp);
- });
- }
- // Step 3: Apply subgroup to workitem distribution patterns.
- RewritePatternSet patterns(&getContext());
- xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- // distributionFn is used by vector distribution patterns to determine the
- // distributed vector type for a given vector value. In XeGPU subgroup
- // distribution context, we compute this based on lane layout.
- auto distributionFn = [](Value val) {
- VectorType vecType = dyn_cast<VectorType>(val.getType());
- int64_t vecRank = vecType ? vecType.getRank() : 0;
- if (vecRank == 0)
- return AffineMap::get(val.getContext());
- // Get the layout of the vector type.
- xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
- // If no layout is specified, assume uniform case (no distribution).
- if (!layout)
- return AffineMap::get(val.getContext());
- // Expecting vector and layout rank to match.
- assert(layout.getRank() == vecRank &&
- "Expecting vector and layout rank to match");
- // A dimension is distributed only if layout suggests there are
- // multiple lanes assigned for this dimension and the shape can be evenly
- // distributed to those lanes.
- SmallVector<unsigned int> distributedDims;
- for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
- if (v > 1 && vecType.getShape()[i] % v == 0)
- distributedDims.push_back(i);
- }
- return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
- val.getContext());
- };
- // TODO: shuffleFn is not used.
- auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
- int64_t warpSz) { return Value(); };
-
- vector::populateDistributeReduction(
- patterns, xegpu::subgroupReduction,
- /*pattern benefit=*/PatternHierarchy::Regular);
-
- vector::populatePropagateWarpVectorDistributionPatterns(
- patterns, distributionFn, shuffleFn,
- /*pattern benefit=*/PatternHierarchy::Regular);
- if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
- signalPassFailure();
- return;
- }
-
- // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
- // due to tensor desc type mismatches created by using upstream distribution
- // patterns (scf.for). This cleanup should only be done if all the ops are
- // distributed successfully; if some ops are still not distributed and remain
- // inside any WarpExecuteOnLane0Op, we avoid this simplification step to avoid
- // breaking the IR.
- bool foundWarpOp = false;
- getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
- // Look for WarpOps that are not trivially dead.
- if (isOpTriviallyDead(warpOp))
- return WalkResult::advance();
- foundWarpOp = true;
- return WalkResult::interrupt();
- });
- if (foundWarpOp)
- return;
-
- getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
- // We are only interested in UnrealizedConversionCastOps that were added
- // for resolving SIMT type mismatches.
- if (!op->getAttr(resolveSIMTTypeMismatch))
- return WalkResult::skip();
-
- Value input = op.getOperand(0);
- Value output = op.getResult(0);
-
- // Both input and output must have tensor descriptor types.
- xegpu::TensorDescType inputDescType =
- mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
- xegpu::TensorDescType outputDescType =
- mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
- assert(inputDescType && outputDescType &&
- "Unrealized conversion cast must have tensor descriptor types");
-
- // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
- // This occurs inside scf.for body to resolve the block argument type to
- // SIMT type.
- if (inputDescType.getLayout()) {
- auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
- if (argument) {
- argument.setType(output.getType());
- output.replaceAllUsesWith(argument);
- if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
- argument.getOwner()->getParentOp())) {
- auto result = loopOp.getTiedLoopResult(argument);
- result.setType(output.getType());
- }
- }
- }
-
- // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
- // conversions. This occurs at the yield op of scf.for body to go back
- // from SIMT type to original type.
- if (outputDescType.getLayout())
- output.replaceAllUsesWith(input);
-
- if (op->use_empty())
- op->erase();
- return WalkResult::advance();
- });
-
- xegpu::removeTemporaryLayoutAttrs(getOperation());
-}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 9c2d8e6aa5247..0fb0ac6e3416d 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -717,8 +717,6 @@ Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
Value reductionResult = arith::ConstantOp::create(
rewriter, loc, acc.getType(),
DenseElementsAttr::get(acc.getType(), zeroAttr));
- // TODO: Remove these get/setTemporaryLayout calls after we deprecate the old
- // XeGPUSubgroupDistribute pass.
auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
// Reduction result should have the same layout as the accumulator.
diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
deleted file mode 100644
index 3c2d987039840..0000000000000
--- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir
+++ /dev/null
@@ -1,94 +0,0 @@
-// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file %s | FileCheck %s
-
-gpu.module @test {
-gpu.func @empty() {
- gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @empty()
-// CHECK-NEXT: gpu.return
-
-// -----
-gpu.module @test {
-gpu.func @gemm(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
- %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
- %2 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
- %3 = xegpu.load_nd %1[%c0, %c0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
- %4 = xegpu.dpas %2, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
- xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
- gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @gemm(
-// CHECK: %[[ARG0:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<16x16xf16>,
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<8x16xf32>)
-// CHECK: %[[LANEID:.*]] = gpu.lane_id
-// CHECK-NEXT: gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
-// CHECK-SAME: args(%[[ARG0]], %[[ARG1]], %[[ARG2]] : memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>)
-// CHECK: ^bb0(%[[ARG3:[a-zA-Z0-9]+]]: memref<8x16xf16>, %[[ARG4:[a-zA-Z0-9]+]]: memref<16x16xf16>,
-// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: memref<8x16xf32>):
-// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG3]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG4]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T1]][{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
-// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[T2]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
-// CHECK-NEXT: %[[T5:.*]] = xegpu.dpas %[[T3]], %[[T4]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-// CHECK-NEXT: %[[T6:.*]] = xegpu.create_nd_tdesc %[[ARG5]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T5]], %[[T6]][%{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-// CHECK: gpu.return
-
-// -----
-gpu.module @test {
-gpu.func @already_in_warp_op() {
- %laneid = gpu.lane_id
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- "test.unknown"() : () -> ()
- gpu.yield
- }
- gpu.return
-}
-}
-
-// CHECK-LABEL: gpu.func @already_in_warp_op()
-// CHECK: %[[LANEID:.*]] = gpu.lane_id
-// CHECK: gpu.warp_execute_on_lane_0(%[[LANEID]])[16]
-// CHECK: "test.unknown"() : () -> ()
-// CHECK: gpu.return
-
-// -----
-gpu.module @test {
-"gpu.func"() ({
-^bb0:
- "test.unknown"() : () -> ()
-}) {function_type = () -> (), kernel, sym_name = "missing_return_terminator"} : () -> ()
-}
-
-// Regression test for MoveFuncBodyToWarpOp on malformed generic gpu.func.
-// CHECK-LABEL: gpu.func @missing_return_terminator
-// CHECK-NEXT: "test.unknown"() : () -> ()
-
-// -----
-
-gpu.module @test {
- gpu.func @multiple_blocks(%cond: i1) {
- cf.cond_br %cond, ^bb1, ^bb2
- ^bb1: // pred: ^bb0
- "test.unknown"() : () -> ()
- cf.br ^bb2
- ^bb2: // 2 preds: ^bb0, ^bb1
- gpu.return
- }
-}
-
-// CHECK-LABEL: gpu.func @multiple_blocks
-// CHECK-SAME: %[[COND:.*]]: i1
-// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2
-// CHECK: ^bb1:
-// CHECK-NEXT: "test.unknown"() : () -> ()
-// CHECK-NEXT: cf.br ^bb2
-// CHECK: ^bb2:
-// CHECK-NEXT: gpu.return
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
similarity index 99%
rename from mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
rename to mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
index f1e56f4493ec7..7f52391bd7928 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute-unit.mlir
@@ -1,6 +1,6 @@
// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=cri' --allow-unregistered-dialect \
-// RUN: --test-xegpu-sg-to-wi-distribute-experimental --split-input-file %s | FileCheck %s
+// RUN: --test-xegpu-sg-to-lane-distribute --split-input-file %s | FileCheck %s
gpu.module @xevm_module {
diff --git a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
similarity index 99%
rename from mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
rename to mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
index c8a9530641951..01f0c1e3e950e 100644
--- a/mlir/test/Dialect/XeGPU/sg-to-wi-experimental.mlir
+++ b/mlir/test/Dialect/XeGPU/sg-to-lane-distribute.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt --allow-unregistered-dialect --xevm-attach-target='module=xevm_* chip=pvc' \
-// RUN: --xegpu-sg-to-wi-distribute-experimental --split-input-file %s --canonicalize --cse | FileCheck %s
+// RUN: --xegpu-sg-to-lane-distribute --split-input-file %s --canonicalize --cse | FileCheck %s
// CHECK-LABEL: gpu.func @gemm
// CHECK-DAG : %[[C0:.*]] = arith.constant 0 : index
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
deleted file mode 100644
index 8ab627a95e0a1..0000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ /dev/null
@@ -1,1271 +0,0 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute \
-// RUN: -allow-unregistered-dialect -canonicalize -cse %s | FileCheck %s
-gpu.module @xevm_module{
-// CHECK-LABEL: gpu.func @store_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
-// CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK: gpu.yield %{{.*}} : vector<16xf32>,
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
-// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.func @store_nd_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %cst = "some_op"() : () -> vector<16xf32>
- xegpu.store_nd %cst, %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- }
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @store_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16]
-// CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16>
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT: xegpu.store_nd %[[CAST]], %[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.func @store_nd_2d(%laneid : index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %cst = "some_op"() : () -> vector<16x16xf16>
- xegpu.store_nd %cst, %0 [%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>,
-// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch}
-// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32>
-gpu.func @load_nd_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0 [%c0] {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
- !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf32>
- gpu.yield %1 : vector<16xf32>
- }
- "some_user_op"(%r) : (vector<1xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16>
-gpu.func @load_nd_2d(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- gpu.yield %1 : vector<16x16xf16>
- }
- "some_user_op"(%r) : (vector<16x1xf16>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @load_nd_array_length
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<32x1xf16>,
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK: gpu.yield %{{.*}} : vector<32x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<
-// CHECK-SAME: array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16],
-// CHECK-SAME: lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
-// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
-// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<32x1xf16>
-gpu.func @load_nd_array_length(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<32x1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>,
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x16xf16>
- gpu.yield %1 : vector<32x16xf16>
- }
- "some_user_op"(%r) : (vector<32x1xf16>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @dpas
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] ->
-// CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) {
-// CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32>
-// CHECK-NEXT: }
-// CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16>
-// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16>
-// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-gpu.func @dpas(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_op"() : () -> vector<8x16xf16>
- %1 = "some_op"() : () -> vector<16x16xf16>
- %2 = "some_op"() : () -> vector<8x16xf32>
- %3 = xegpu.dpas %0, %1, %2
- {
- layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
- gpu.yield %3 : vector<8x16xf32>
- }
- "some_user_op"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
-}
-
-
-
-// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64) {
-// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, ui64
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16>
-// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> {resolve_simt_type_mismatch}
-gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
- %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 ->
- !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- "some_user_op"(%r)
- : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @prefetch_2d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, index, index) {
-// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK-SAME: , index, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2]
-// CHECK-SAME: <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16>
-gpu.func @prefetch_2d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : ()
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.prefetch_nd %0[%c0, %c0]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- }
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @prefetch_1d
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index) {
-// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, index
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16,
-// CHECK-SAME: #xegpu.layout<lane_layout = [16], lane_data = [1]>> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch}
-// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint<cached>,
-// CHECK-SAME: l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16>
-gpu.func @prefetch_1d(%laneid: index) {
- %c0 = arith.constant 0 : index
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %0 = "some_op"() : ()
- -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.prefetch_nd %0[%c0]
- {layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- }
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) {
-// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK: gpu.yield %{{.*}}
-// CHECK: }
-// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
-// CHECK: gpu.barrier
-gpu.func @gpu_barrier(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) {
- %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- %1 = xegpu.load_nd %0[%c0]
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
- : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -> vector<16xf16>
- gpu.barrier
- gpu.yield %1 : vector<16xf16>
- }
- "some_user_op"(%r) : (vector<1xf16>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction
-// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32>
-// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32>
-// CHECK-NEXT: }
-// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
-// CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32
-// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
-// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
-// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32>
-gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<16x32xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
- dense<0.0> : vector<32xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
- } [0]
- : vector<16x32xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
- }
- "some_user_op"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) {
-// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32>
-// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %{{.*}} : vector<16xf32> into f32
-// CHECK-NEXT: %[[T5:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %{{.*}} : vector<16xf32> into f32
-gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<2x16xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>
- }
- [1] : vector<2x16xf32> to vector<2xf32>
- gpu.yield %1 : vector<2xf32>
- }
- "some_user_op"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction
-// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32>
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32>
-// CHECK: gpu.yield %9, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32>
-// CHECK: }
-// CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32>
-// CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32>
-// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T1]], %[[T2]] : vector<16xf32> into f32
-// CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
-// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32>
-// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T4]], %[[T5]] : vector<16xf32> into f32
-// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32>
-gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<32x16xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>}
- dense<0.0> : vector<32xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [1]>
- }
- [1] : vector<32x16xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
- }
- "some_user_op"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction
-// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"()
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
-// CHECK-SAME: : () -> vector<16x2xf32>
-// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]]
-// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T3:.*]] = vector.reduction <add>, %[[T2]], %[[CST]] : vector<16xf32> into f32
-// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]]
-// CHECK-SAME: {layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
-// CHECK-SAME: layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
-// CHECK-SAME: : vector<16x1xf32> to vector<16xf32>
-// CHECK: %[[T6:.*]] = vector.reduction <add>, %[[T5]], %[[CST]] : vector<16xf32> into f32
-gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
- : () -> (vector<16x2xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>}
- dense<0.0> : vector<2xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, dims = [0]>
- }
- [0] : vector<16x2xf32> to vector<2xf32>
- gpu.yield %1 : vector<2xf32>
- }
- "some_user_op"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_leading_unit_dim
-// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<1x32xf32>
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<1x2xf32>, vector<1x16x2xf32>, vector<1x2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x16x32xf32>
-// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<1x32xf32>, vector<1x16x32xf32>, vector<1x32xf32>
-// CHECK-NEXT: }
-// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [0, 0, 0], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
-// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<1x16x1xf32> to vector<16xf32>
-// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x2xf32>
-// CHECK: %[[T4:.*]] = vector.reduction <add>, %[[T2]], %[[T3]] : vector<16xf32> into f32
-// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [0, 0, 1], sizes = [1, 16, 1], strides = [1, 1, 1]} : vector<1x16x2xf32> to vector<1x16x1xf32>
-// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<1x16x1xf32> to vector<16xf32>
-// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[0, 1] : f32 from vector<1x2xf32>
-// CHECK: %[[T8:.*]] = vector.reduction <add>, %[[T6]], %[[T7]] : vector<16xf32> into f32
-// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<1x2xf32>
-gpu.func @vector_multi_reduction_3d_leading_unit_dim(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x2xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- : () -> (vector<1x16x32xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
- dense<0.0> : vector<1x32xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
- }
- [1] : vector<1x16x32xf32> to vector<1x32xf32>
- gpu.yield %1 : vector<1x32xf32>
- }
- "some_user_op"(%r) : (vector<1x2xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_multi_reduction_3d_trivial_reduction
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<1x1xf32>, vector<1x1x1xf32>, vector<1x1xf32>) {
-// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<1x1x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[SRC]], %{{.*}} : vector<1x16xf32>, vector<1x1x16xf32>, vector<1x16xf32>
-// CHECK-NEXT: }
-// CHECK: %[[A:.*]] = vector.extract %[[W]]#2[0, 0] : f32 from vector<1x1xf32>
-// CHECK: %[[S:.*]] = vector.extract %[[W]]#1[0, 0, 0] : f32 from vector<1x1x1xf32>
-// CHECK: %[[ADD:.*]] = arith.addf %[[S]], %[[A]] : f32
-// CHECK: %[[BC:.*]] = vector.broadcast %[[ADD]] : f32 to vector<1x1xf32>
-gpu.func @vector_multi_reduction_3d_trivial_reduction(%laneid: index) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
- %src = "some_def"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- : () -> (vector<1x1x16xf32>)
- %acc = arith.constant
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>}
- dense<0.0> : vector<1x16xf32>
- %1 = vector.multi_reduction <add>, %src, %acc
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>,
- layout_operand_1 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [1]>
- }
- [1] : vector<1x1x16xf32> to vector<1x16xf32>
- gpu.yield %1 : vector<1x16xf32>
- }
- "some_user_op"(%r) : (vector<1x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) {
-// CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex>
-// CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] :
-// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
-// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) {
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %1 = arith.constant dense<1>: vector<16xi1>
- %offset = arith.constant dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
- : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
- : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- }
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
-// CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<16xindex>
-// CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<16xi1>
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
-// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
-// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) {
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %1 = arith.constant dense<1> : vector<16xi1>
- %offset = arith.constant dense<12> : vector<16xindex>
- %3 = xegpu.load %src[%offset], %1
- {
- layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
- xegpu.store %3, %src[%offset], %1
- {
- layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- }
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @scatter_ops_with_leading_dims({{.*}}) {
-// CHECK: %[[OFFSETS:.*]] = arith.constant dense<12> : vector<1x1x16xindex>
-// CHECK: %[[MASKS:.*]] = arith.constant dense<true> : vector<1x1x16xi1>
-// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: -> (vector<1x1x1xf16>, memref<256xf16>, vector<1x1x1xindex>, vector<1x1x1xi1>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
-// CHECK-SAME: : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[V1:.*]] = vector.shape_cast %[[W]]#2 : vector<1x1x1xindex> to vector<1xindex>
-// CHECK-NEXT: %[[V2:.*]] = vector.shape_cast %[[W]]#3 : vector<1x1x1xi1> to vector<1xi1>
-// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[V1]]], %[[V2]]
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
-// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[V1]]], %[[V2]]
-// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.func @scatter_ops_with_leading_dims(%src: memref<256xf16>, %laneid: index) {
- gpu.warp_execute_on_lane_0(%laneid)[16] {
- %1 = arith.constant
- dense<1> : vector<1x1x16xi1>
- %offset = arith.constant
- dense<12> : vector<1x1x16xindex>
- %3 = xegpu.load %src[%offset], %1 {layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- : memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf16>
- xegpu.store %3, %src[%offset], %1 { layout = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>}
- : vector<1x1x16xf16>, memref<256xf16>, vector<1x1x16xindex>, vector<1x1x16xi1>
- }
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index
-// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64
-gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
- %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
- gpu.yield %ptr : index
- }
- %ptr_i64 = arith.index_cast %r : index to i64
- "some_user_op"(%ptr_i64) : (i64) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @memref_alloca(
-// CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() : memref<2048xi8, 3>
-// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ALLOCA]] : memref<2048xi8, 3> -> index
-// CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[INTPTR]] : index to i64
-gpu.func @memref_alloca(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<2048xi8, 3>) {
- %alloca = memref.alloca() : memref<2048xi8, 3>
- gpu.yield %alloca : memref<2048xi8, 3>
- }
- %ptr = memref.extract_aligned_pointer_as_index %r : memref<2048xi8, 3> -> index
- %ptr_i64 = arith.index_cast %ptr : index to i64
- "some_user_op"(%ptr_i64) : (i64) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @create_memdesc(
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}} : !xegpu.mem_desc<4x128xf32>, memref<2048xi8, 3>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[MDesc:.*]] = xegpu.create_mem_desc %[[W]]#1 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32>
-gpu.func @create_memdesc(%laneid: index, %arg0 : memref<2048xi8, 3>) {
- %c0 = arith.constant 0 : index
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.mem_desc<4x128xf32>) {
- %mdesc = xegpu.create_mem_desc %arg0 : memref<2048xi8, 3> -> !xegpu.mem_desc<4x128xf32>
- gpu.yield %mdesc : !xegpu.mem_desc<4x128xf32>
- }
- %25 = xegpu.load_matrix %r[%c0, %c0]: !xegpu.mem_desc<4x128xf32>, index, index -> vector<1x16xf32>
- "some_user_op"(%25) : (vector<1x16xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_transpose(
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) {
-// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32>
-// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32>
-gpu.func @vector_transpose(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
- : () -> (vector<16x2xf32>)
- %transpose = vector.transpose %cst, [1, 0]
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16 , 1], lane_data = [1, 1], order = [0, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x2xf32> to vector<2x16xf32>
- gpu.yield %transpose : vector<2x16xf32>
- }
- "some_user_op"(%r) : (vector<2x1xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_bitcast(
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) {
-// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8>
-// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8>
-// CHECK: }
-// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16>
-gpu.func @vector_bitcast(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}
- : () -> (vector<4x32xi8>)
- %bitcast = vector.bitcast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<4x32xi8> to vector<4x16xi16>
- gpu.yield %bitcast : vector<4x16xi16>
- }
- "some_user_op"(%r) : (vector<4x1xi16>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing
-// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
-// CHECK: gpu.yield %{{.*}} : vector<1x16xf32>, vector<16xf32>
-// CHECK: }
-// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1xf32> to vector<1x1xf32>
-gpu.func @vector_shapecast_rank_increasing(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
- : () -> (vector<16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16xf32> to vector<1x16xf32>
- gpu.yield %cast : vector<1x16xf32>
- }
- "some_user_op"(%r) : (vector<1x1xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_shapecast_rank_reducing(
-// CHECK: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1xf32>, vector<1x1xf32>) {
-// CHECK: gpu.yield %{{.*}} : vector<16xf32>, vector<1x16xf32>
-// CHECK: }
-// CHECK: %{{.*}} = vector.shape_cast %{{.*}}#1 : vector<1x1xf32> to vector<1xf32>
-gpu.func @vector_shapecast_rank_reducing(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<1x16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>
- }
- : vector<1x16xf32> to vector<16xf32>
- gpu.yield %cast : vector<16xf32>
- }
- "some_user_op"(%r) : (vector<1xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_shapecast_rank_increasing_without_slicing_layout
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x1xf32>, vector<1xf32>) {
-// CHECK: %[[T1:.*]] = vector.shape_cast %{{.*}} {layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf32> to vector<1x16xf32>
-// CHECK: gpu.yield %[[T1]], %{{.*}} : vector<1x16xf32>, vector<16xf32>
-// CHECK: }
-// CHECK: %{{.*}} = vector.shape_cast %[[W]]#1 : vector<1xf32> to vector<1x1xf32>
-// CHECK: gpu.return
-gpu.module @xevm_module{
-gpu.func @vector_shapecast_rank_increasing_without_slicing_layout(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x1xf32>) {
- %cst = "some_op"()
- {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]> }
- : () -> (vector<16xf32>)
- %cast = vector.shape_cast %cst
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16xf32> to vector<1x16xf32>
- gpu.yield %cast : vector<1x16xf32>
- }
- "some_user_op"(%r) : (vector<1x1xf32>) -> ()
- gpu.return
-}
-}
-
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted
-// CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_distributed_dim_fully_extracted(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_def"() : () -> (vector<24x16xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 16], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<24x16xf32> to vector<8x16xf32>
- gpu.yield %1 : vector<8x16xf32>
- }
- "some_use"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_non_distributed
-// CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x1xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x1xf32>, vector<24x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_non_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_def"() : () -> (vector<24x1xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [8, 0], sizes = [8, 1], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<24x1xf32> to vector<8x1xf32>
- gpu.yield %1 : vector<8x1xf32>
- }
- "some_use"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_inner_distributed
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x4xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x64xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x64xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [8, 3], sizes = [8, 1], strides = [1, 1]} : vector<24x4xf32> to vector<8x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_inner_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_def"() : () -> (vector<24x64xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [8, 48], sizes = [8, 16], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<24x64xf32> to vector<8x16xf32>
- gpu.yield %1 : vector<8x16xf32>
- }
- "some_use"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_outer_distributed
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<1x16xf32>, vector<2x16xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<32x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<16x16xf32>, vector<32x16xf32>
-// CHECK: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
-// CHECK-NEXT: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16xf32> to vector<1x16xf32>
-// CHECK-NEXT: "some_use"(%[[T2]]) : (vector<1x16xf32>) -> ()
-gpu.func @vector_extract_strided_slice_outer_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1x16xf32>) {
- %0 = "some_def"() : () -> (vector<32x16xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [16], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
- }
- : vector<32x16xf32> to vector<16x16xf32>
- gpu.yield %1 : vector<16x16xf32>
- }
- "some_use"(%r) : (vector<1x16xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_1d
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<4xf32>) {
-// CHECK: %[[S:.*]] = "some_def"() : () -> vector<64xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<32xf32>, vector<64xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [1], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<2xf32>) -> ()
-gpu.func @vector_extract_strided_slice_1d(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %0 = "some_def"() : () -> (vector<64xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [16], sizes = [32], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<64xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
- }
- "some_use"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_offset
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_offset(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %0 = "some_def"() : () -> (vector<64xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [3], sizes = [32], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<64xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
- }
- "some_use"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_unsopported_source
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.extract_strided_slice
-gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) {
- %0 = "some_def"() : () -> (vector<54xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [0], sizes = [32], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<54xf32> to vector<32xf32>
- gpu.yield %1 : vector<32xf32>
- }
- "some_use"(%r) : (vector<2xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
-// CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
-// CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
-gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
- %0 = "some_def"() : () -> (vector<24x16xf32>)
- %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<24x16xf32> to vector<8x16xf32>
- gpu.yield %1 : vector<8x16xf32>
- }
- "some_use"(%r) : (vector<8x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
-// CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16x16xf32>, vector<64x16xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
- %0 = "some_def"() : () -> (vector<16x16xf32>)
- %1 = "some_def"() : () -> (vector<64x16xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x16xf32> into vector<64x16xf32>
- gpu.yield %2 : vector<64x16xf32>
- }
- "some_use"(%r) : (vector<64x1xf32>) -> ()
- gpu.return
-}
-
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_non_distributed
-// CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x1xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x1xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [24, 0], strides = [1, 1]} : vector<16x1xf32> into vector<64x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_non_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
- %0 = "some_def"() : () -> (vector<16x1xf32>)
- %1 = "some_def"() : () -> (vector<64x1xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 0], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x1xf32> into vector<64x1xf32>
- gpu.yield %2 : vector<64x1xf32>
- }
- "some_use"(%r) : (vector<64x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_inner_distributed
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x2xf32>, vector<16x1xf32>, vector<64x2xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x32xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x32xf32>, vector<16x16xf32>, vector<64x32xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [24, 1], strides = [1, 1]} : vector<16x1xf32> into vector<64x2xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x2xf32>) -> ()
-gpu.func @vector_insert_strided_slice_inner_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x2xf32>) {
- %0 = "some_def"() : () -> (vector<16x16xf32>)
- %1 = "some_def"() : () -> (vector<64x32xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [24, 16], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16x16xf32> into vector<64x32xf32>
- gpu.yield %2 : vector<64x32xf32>
- }
- "some_use"(%r) : (vector<64x2xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_outer_distributed
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3x32xf32>, vector<1x16xf32>, vector<3x32xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16x16xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48x32xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48x32xf32>, vector<16x16xf32>, vector<48x32xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [2, 4], strides = [1, 1]} : vector<1x16xf32> into vector<3x32xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3x32xf32>) -> ()
-gpu.func @vector_insert_strided_slice_outer_distributed(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3x32xf32>) {
- %0 = "some_def"() : () -> (vector<16x16xf32>)
- %1 = "some_def"() : () -> (vector<48x32xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [32, 4], strides = [1, 1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>
- }
- : vector<16x16xf32> into vector<48x32xf32>
- gpu.yield %2 : vector<48x32xf32>
- }
- "some_use"(%r) : (vector<3x32xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_1d
-// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>, vector<1xf32>, vector<3xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<48xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<48xf32>, vector<16xf32>, vector<48xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [1], strides = [1]} : vector<1xf32> into vector<3xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<3xf32>) -> ()
-gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
- %0 = "some_def"() : () -> (vector<16xf32>)
- %1 = "some_def"() : () -> (vector<48xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<16xf32> into vector<48xf32>
- gpu.yield %2 : vector<48xf32>
- }
- "some_use"(%r) : (vector<3xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
-// CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<1xf32>, vector<64x1xf32>) {
-// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32>
-// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
-// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16xf32>, vector<64x16xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
-// CHECK-SAME: {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
-// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
-gpu.func @vector_insert_strided_slice_different_ranks(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
- %0 = "some_def"() : () -> (vector<16xf32>)
- %1 = "some_def"() : () -> (vector<64x16xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : vector<16xf32> into vector<64x16xf32>
- gpu.yield %2 : vector<64x16xf32>
- }
- "some_use"(%r) : (vector<64x1xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_source
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.insert_strided_slice
-gpu.func @vector_insert_strided_slice_unsupported_source(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
- %0 = "some_def"() : () -> (vector<8xf32>)
- %1 = "some_def"() : () -> (vector<48xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [16], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<8xf32> into vector<48xf32>
- gpu.yield %2 : vector<48xf32>
- }
- "some_use"(%r) : (vector<3xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_offset
-// CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
-// CHECK: }
-// CHECK-NOT: %{{.*}} = vector.insert_strided_slice
-gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<3xf32>) {
- %0 = "some_def"() : () -> (vector<16xf32>)
- %1 = "some_def"() : () -> (vector<48xf32>)
- %2 = vector.insert_strided_slice %0, %1 { offsets = [3], strides = [1],
- layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
- }
- : vector<16xf32> into vector<48xf32>
- gpu.yield %2 : vector<48xf32>
- }
- "some_use"(%r) : (vector<3xf32>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane
-// CHECK-SAME: (%[[ARG0:.*]]: index) {
-// CHECK: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1x16x1xf16>, vector<1xf16>, vector<16x1xf16>)
-// CHECK: %[[DEF0:.*]] = "some_def"() : () -> vector<16xf16>
-// CHECK: %[[DEF1:.*]] = "some_def"() : () -> vector<16x16xf16>
-// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF0]]
-// CHECK: %[[CAST_INNER:.*]] = vector.shape_cast %[[DEF1]] : vector<16x16xf16> to vector<1x16x16xf16>
-// CHECK: gpu.yield %[[BCAST_INNER]], %[[CAST_INNER]], %[[DEF0]], %[[DEF1]]
-// CHECK: %[[CAST:.*]] = vector.shape_cast %[[R]]#3 : vector<16x1xf16> to vector<1x16x1xf16>
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#2 : vector<1xf16> to vector<16x1xf16>
-// CHECK: "some_use"(%[[BCAST]]) : (vector<16x1xf16>) -> ()
-// CHECK: "some_use"(%[[CAST]]) : (vector<1x16x1xf16>) -> ()
-gpu.func @vector_broadcast_1d_to_2d_to_3d_broadcast_within_lane(%laneid: index) {
-
- %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>, vector<1x16x1xf16>) {
-
- %1 = "some_def"() : () -> vector<16xf16>
- %3 = "some_def"() : () -> vector<16x16xf16>
-
- %2 = vector.broadcast %1 {
- layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- } : vector<16xf16> to vector<16x16xf16>
-
- %4 = vector.broadcast %3 {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>
- } : vector<16x16xf16> to vector<1x16x16xf16>
-
- gpu.yield %2, %4 : vector<16x16xf16>, vector<1x16x16xf16>
- }
- "some_use"(%r#0) : (vector<16x1xf16>) -> ()
- "some_use"(%r#1) : (vector<1x16x1xf16>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>)
-// CHECK: %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16>
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]]
-// CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16>
-// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16>
-// CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> ()
-gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
- %1 = "some_def"() : () -> vector<16x1xf16>
- %2 = vector.broadcast %1 {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- } : vector<16x1xf16> to vector<16x16xf16>
- gpu.yield %2: vector<16x16xf16>
- }
- "some_use"(%0) : (vector<16x1xf16>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16)
-// CHECK: %[[DEF:.*]] = "some_def"()
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
-// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
-// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16>
-// CHECK: "some_use"(%[[RESULT]])
-gpu.func @vector_shape_cast_scalar_to_vector(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
- %1 = "some_def"() : () -> f16
- %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
- gpu.yield %2 : vector<16x16xf16>
- }
- "some_use"(%0) : (vector<16x1xf16>) -> ()
- gpu.return
-}
-
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector_uniform
-// CHECK-SAME: (%[[ARG0:.*]]: index)
-// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x16xf16>, f16)
-// CHECK: %[[DEF:.*]] = "some_def"()
-// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] : f16 to vector<16x16xf16>
-// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
-// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x16xf16>
-// CHECK: "some_use"(%[[RESULT]])
- gpu.func @vector_shape_cast_scalar_to_vector_uniform(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x16xf16>) {
- %1 = "some_def"() : () -> f16
- %2 = vector.broadcast %1 : f16 to vector<16x16xf16>
- gpu.yield %2 : vector<16x16xf16>
- }
- "some_use"(%0) : (vector<16x16xf16>) -> ()
- gpu.return
- }
-
-// CHECK-LABEL: gpu.func @vector_step_slice
-// CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
-// CHECK: %[[LANE_ID_IN_SLICED_DIM:.*]] = arith.remui %[[LANE_ID]], %c16 : index
-// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM1:.*]] = arith.remui %[[LANE_ID_IN_SLICED_DIM]], %c16 : index
-// CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = vector.broadcast %[[LANE_ID_IN_SLICED_DIM1]] : index to vector<1xindex>
-// CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
- gpu.func @vector_step_slice(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
- %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 2]>} : vector<16xindex>
- gpu.yield %5 : vector<16xindex>
- }
- "some_use"(%0) : (vector<1xindex>) -> ()
- gpu.return
- }
-
- // CHECK-LABEL: gpu.func @vector_step_slice_unit
- // CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
- // CHECK-NEXT: %[[LANE_ID_IN_SLICED_DIM_VEC:.*]] = arith.constant dense<0> : vector<1xindex>
- // CHECK-NEXT: "some_use"(%[[LANE_ID_IN_SLICED_DIM_VEC]]) : (vector<1xindex>) -> ()
- gpu.func @vector_step_slice_unit(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<1xindex>) {
- %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1, 16], lane_data = [1, 1, 1, 1]>, dims = [0, 1, 3]>} : vector<1xindex>
- gpu.yield %5 : vector<1xindex>
- }
- "some_use"(%0) : (vector<1xindex>) -> ()
- gpu.return
- }
-
- // CHECK-LABEL: gpu.func @vector_step_slice_multi_dist_unit
- // CHECK: (%[[LANE_ID:[0-9a-zA-Z]+]]: index) {
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
- // CHECK-DAG: %[[DIST_UNIT_SIZE:.*]] = arith.constant 8 : index
- // CHECK-DAG: %[[SG_LEVEL_VECSIZE:.*]] = arith.constant 16 : index
- // CHECK-DAG: %[[LANE_LAYOUT:.*]] = arith.constant 4 : index
- // CHECK-DAG: %[[LANE_DATA:.*]] = arith.constant 2 : index
- // CHECK-DAG: %[[LANE_DIST_UNIT_START_IDX:.*]] = arith.divui %[[LANE_ID]], %[[LANE_DATA]] : index
- // CHECK-DAG: %[[DIST_UNIT_0_IDX:.*]] = arith.remui %[[LANE_DIST_UNIT_START_IDX]], %[[LANE_LAYOUT]] : index
- // CHECK-DAG: %[[DIST_UNIT_0_OFFSET:.*]] = arith.muli %[[DIST_UNIT_0_IDX]], %[[LANE_DATA]] : index
- // CHECK-DAG: %[[DIST_UNIT_0_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_0_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
- // CHECK-DAG: %[[DIST_UNIT_1_OFFSET:.*]] = arith.addi %[[DIST_UNIT_0_OFFSET]], %[[DIST_UNIT_SIZE]] : index
- // CHECK-DAG: %[[DIST_UNIT_1_SUBRANGE_START:.*]] = arith.remui %[[DIST_UNIT_1_OFFSET]], %[[SG_LEVEL_VECSIZE]] : index
- // CHECK-DAG: %[[V6:.*]] = arith.addi %[[DIST_UNIT_0_SUBRANGE_START]], %[[C1]] : index
- // CHECK-DAG: %[[V7:.*]] = arith.addi %[[DIST_UNIT_1_SUBRANGE_START]], %[[C1]] : index
- // CHECK-DAG: %[[VEC:.*]] = vector.from_elements
- // CHECK-SAME: %[[DIST_UNIT_0_SUBRANGE_START]], %[[V6]],
- // CHECK-SAME: %[[DIST_UNIT_1_SUBRANGE_START]], %[[V7]]
- // CHECK-SAME: : vector<4xindex>
- // CHECK-NEXT: "some_use"(%[[VEC]]) : (vector<4xindex>) -> ()
- gpu.func @vector_step_slice_multi_dist_unit(%arg0: index) {
- %0 = gpu.warp_execute_on_lane_0(%arg0)[4] -> (vector<4xindex>) {
- %5 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [2, 4, 2], lane_data = [1,2,1]>, dims = [0, 2]>} : vector<16xindex>
- gpu.yield %5 : vector<16xindex>
- }
- "some_use"(%0) : (vector<4xindex>) -> ()
- gpu.return
- }
-
- // CHECK-LABEL: gpu.func @convert_layout_removed_when_compatible(
- // CHECK: %[[R:.*]] = gpu.warp_execute_on_lane_0
- // CHECK-NOT: xegpu.convert_layout
- // CHECK: gpu.yield %{{.*}} : vector<16xf32>
- gpu.func @convert_layout_removed_when_compatible(%laneid: index){
- %r:2 = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>, vector<1xf32>) {
- %0 = "some_op"() : () -> vector<16xf32>
- %2 = "some_op"() : () -> vector<1xf32>
- %1 = xegpu.convert_layout %0
- <{input_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
- target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}>
- : vector<16xf32>
- %3 = xegpu.convert_layout %2
- <{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
- target_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}>
- : vector<1xf32>
- %4 = xegpu.convert_layout %3
- <{input_layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>,
- target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 16], lane_data = [1, 1, 1]>, dims = [0, 1]>}>
- : vector<1xf32>
- gpu.yield %1, %4 : vector<16xf32>, vector<1xf32>
- }
- "some_user_op"(%r#0, %r#1) : (vector<1xf32>, vector<1xf32>) -> ()
- gpu.return
- }
-
- // CHECK-NOT: xegpu.convert_layout
- // CHECK: gpu.yield %{{.*}} : f32
- gpu.func @convert_layout_scalar(%laneid: index){
- %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (f32) {
- %0 = "some_op"() : () -> f32
- %1 = xegpu.convert_layout %0
- <{input_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>,
- target_layout = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>}>
- : f32
- gpu.yield %1 : f32
- }
- "some_user_op"(%r) : (f32) -> ()
- gpu.return
- }
-}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
deleted file mode 100644
index 285669cae7174..0000000000000
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ /dev/null
@@ -1,439 +0,0 @@
-// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \
-// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
-
-// CHECK-LABEL: gpu.func @load_dpas_postop_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>,
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T6:.*]] = math.exp %[[T5]] : vector<8x1xf32>
-// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @xevm_module{
- gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
- -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} :
- !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-
- %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
- -> vector<16x16xf16>
-
- %4 = xegpu.dpas %1, %3
- {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
-
- %5 = math.exp %4
- {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x16xf32>
-
- %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %5, %6[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @gemm
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>,
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]])
-// CHECK-SAME: -> (vector<8x1xf32>) {
-// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]]
-// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @xevm_module{
-gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %c8 = arith.constant 8 : index
- %c1024 = arith.constant 1024 : index
- %block_id_x = gpu.block_id x
- %block_id_y = gpu.block_id y
- %0 = arith.muli %block_id_x, %c8 : index
- %1 = arith.muli %block_id_y, %c16 : index
- %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> ->
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %3 = xegpu.load_nd %2[%0, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf32>
-
- %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) {
-
- %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16>
- -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>
-
- %7 = xegpu.load_nd %5[%0, %arg3]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xbf16>
- %8 = xegpu.load_nd %6[%arg3, %1]
- {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xbf16>
-
- %9 = xegpu.dpas %7, %8, %arg4
- {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32>
-
- scf.yield %9 : vector<8x16xf32>
- } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
-
- xegpu.store_nd %4, %2[%0, %1] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
-}
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_yield
-// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
-// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
-// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
-// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
-// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
-// CHECK-NEXT: } else {
-// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @xevm_module{
- gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- %loaded = scf.if %pred -> (vector<16x8xf16>) {
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- scf.yield %3 : vector<16x8xf16>
- } else {
- %3 = arith.constant {
- layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- } dense<12.> : vector<16x8xf16>
- scf.yield %3 : vector<16x8xf16>
- } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
- xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
-// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
-// CHECK: scf.if %[[PREDICATE]] {
-// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
-// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-// CHECK-NEXT: }
-gpu.module @xevm_module{
- gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
- %pred = llvm.mlir.poison : i1
- %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
- %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
- scf.if %pred {
- %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
- layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
- } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
- xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
- }
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @mma_transpose_b(
-// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
-// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}>
-// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
-// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
-// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
-// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
-// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-gpu.module @xevm_module{
- gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
- %c0 = arith.constant 0 : index
- %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
- -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
- %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
- -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>>
- %3 = xegpu.load_nd %2[%c0, %c0] {layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>}
- : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1], order = [0, 1]>> -> vector<16x8xi32>
- %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2], order = [0, 1]>}
- : vector<16x8xi32> to vector<16x16xf16>
- %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
- : vector<16x16xf16> to vector<16x16xf16>
- %6 = xegpu.dpas %1, %5
- {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
- layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
- %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %6, %7[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf32>,
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
-
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @warp_scf_for_unused_uniform_for_result(
-// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] args(%{{.*}} : index,
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
-// CHECK-SAME: memref<16x16xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK: gpu.yield %{{.*}}, {{.*}} : vector<16x16xf32>, vector<16x1xf32>
-// CHECK: }
-// CHECK: %{{.*}}:2 = scf.for {{.*}} to %{{.*}} step %{{.*}} iter_args
-// CHECK-SAME: (%{{.*}} = %[[W]]#0, %{{.*}} = %[[W]]#1) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
-// CHECK-SAME: args(%{{.*}} : vector<16x1xf32>, vector<16x1xf32>) -> (vector<16x1xf32>, vector<16x1xf32>) {
-// CHECK: gpu.yield %{{.*}}, %{{.*}} : vector<16x16xf32>, vector<16x1xf32>
-// CHECK: }
-// CHECK: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<16x1xf32>, vector<16x1xf32>
-// CHECK: }
-gpu.module @xevm_module{
- gpu.func @warp_scf_for_unused_uniform_for_result(%arg0: index,
- %arg1: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
- %arg2: memref<16x16xf32>) {
- %c128 = arith.constant 128 : index
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %ini = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<16x1xf32>)
- %ini2 = "some_def"() {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : () -> (vector<16x16xf32>)
- %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini2, %arg5 = %ini) -> (vector<16x16xf32>, vector<16x1xf32>) {
- %1 = "some_def"(%arg5)
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : (vector<16x1xf32>) -> (vector<16x1xf32>)
- %acc = "some_def"(%arg4, %1)
- {
- layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- : (vector<16x16xf32>, vector<16x1xf32>) -> (vector<16x16xf32>)
- scf.yield %acc, %1 : vector<16x16xf32>, vector<16x1xf32>
- }
- {
- layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
- }
- xegpu.store_nd %3#0, %arg1[%c0, %c0] {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
- : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) {
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: %[[C8:.*]] = arith.constant 8 : index
-// CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C8]]
-// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C8]]
-// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C2]]
-// CHECK: %[[REMU3:.*]] = arith.remui %[[REMU2]], %[[C2]]
-// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C8]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-gpu.module @xevm_module{
- gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) {
- %c0 = arith.constant 0 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) {
-// CHECK: %[[C8:.*]] = arith.constant 8 : index
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: %[[C4:.*]] = arith.constant 4 : index
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[LANE_ID:.*]] = gpu.lane_id
-// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C4]]
-// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C4]]
-// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C4]]
-// CHECK: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C2]]
-// CHECK: %[[REMU3:.*]] = arith.remui %[[MUL]], %[[C8]]
-// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C4]]
-// CHECK: %[[ADD:.*]] = arith.addi %[[REMU4]], %[[C1]]
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index
-gpu.module @xevm_module{
- gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) {
-// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
-// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32>
-// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>:
-// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index
-gpu.module @xevm_module{
- gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
- !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32>
- xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} :
- vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
-gpu.module @xevm_module{
- gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
- %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
- %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
- // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
- %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
- xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) {
-gpu.module @xevm_module{
- gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
- %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
-
- %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
- %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
- // CHECK-NOT: vector.broadcast
- // CHECK-NOT: vector.shape_cast
-
- %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
- // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-
- xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) {
-gpu.module @xevm_module{
- gpu.func @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
- %c0 = arith.constant 0 : index
- %9 = gpu.block_id x
- %10 = arith.index_cast %9 : index to i16
- %11 = arith.bitcast %10 : i16 to f16
- // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16>
- %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
- %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
- -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- gpu.return
- }
-}
-
-// -----
-gpu.module @xevm_test {
- // CHECK-LABEL: gpu.func @vector_reduce_2d
- // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
- // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
- // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
- // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
- // CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
- // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
- // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
- // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
- // CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction <add>, %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32
- // CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32
- // CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32
- // CHECK: %[[SHUFFLE_1:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_0]], %[[C2]], %[[C16]] : f32
- // CHECK: %[[VEC_RED_1:.*]] = arith.addf %[[VEC_RED_0]], %[[SHUFFLE_1]] : f32
- // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_1]], %[[C4]], %[[C16]] : f32
- // CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32
- // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
- // CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
- // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32>
- // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
- gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
- %cst = arith.constant 1.000000e+00 : f32
- %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
- %2 = vector.broadcast %cst : f32 to vector<16xf32>
- %3 = vector.multi_reduction <add>, %1, %2 [0] : vector<4x16xf32> to vector<16xf32>
- %4 = vector.reduction <add>, %3 : vector<16xf32> into f32
- %40 = xegpu.convert_layout %4 <{input_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims = [0]>, target_layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>, dims = [0]>}>: f32
- %5 = vector.broadcast %40 : f32 to vector<16xf32>
- %cst_0 = arith.constant dense<0> : vector<16xindex>
- %cst_1 = arith.constant dense<true> : vector<16xi1>
- xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
- gpu.return
- }
-}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir b/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir
deleted file mode 100644
index c3fdd9c90ffd5..0000000000000
--- a/mlir/test/Dialect/XeGPU/xegpu-subgroup-distribute-no-arch.mlir
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: mlir-opt --xegpu-subgroup-distribute -split-input-file %s | FileCheck %s
-// Regression test for https://github.com/llvm/llvm-project/issues/181531:
-// Running --xegpu-subgroup-distribute without a chip target attribute used to
-// call llvm_unreachable in getUArch(). The pass should now bail out gracefully.
-
-// CHECK-LABEL: gpu.func @no_crash_without_chip_attr
-// CHECK: gpu.return
-gpu.module @test_module {
- gpu.func @no_crash_without_chip_attr(%arg0: memref<8x16xf16>, %arg1: memref<8x16xf16>) {
- gpu.return
- }
-}
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 581072f5218cd..5c3721630837d 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -231,51 +231,20 @@ struct TestXeGPURecoverTemporaryLayouts
}
};
-struct TestXeGPUSGDistribute
- : public PassWrapper<TestXeGPUSGDistribute,
- OperationPass<gpu::GPUModuleOp>> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute)
-
- StringRef getArgument() const final { return "test-xegpu-sg-distribute"; }
-
- StringRef getDescription() const final {
- return "Test the implementation of XeGPU Subgroup Distribution";
- }
-
- void getDependentDialects(::mlir::DialectRegistry &registry) const override {
- registry.insert<arith::ArithDialect>();
- registry.insert<memref::MemRefDialect>();
- registry.insert<xegpu::XeGPUDialect>();
- registry.insert<vector::VectorDialect>();
- registry.insert<index::IndexDialect>();
- }
-
- TestXeGPUSGDistribute() = default;
- TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default;
-
- void runOnOperation() override {
- RewritePatternSet patterns(&getContext());
- xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
- }
-};
-
-/// This test pass is intended to test the subgroup to workitem distribution of
+/// This test pass is intended to test the subgroup to lane distribution of
/// xegpu/vector/arith operations in isolation, it does not handle any
/// structural ops like scf.for etc.
-struct TestXeGPUSgToWiDistributeExperimental
- : public PassWrapper<TestXeGPUSgToWiDistributeExperimental,
+struct TestXeGPUSgToLaneDistribute
+ : public PassWrapper<TestXeGPUSgToLaneDistribute,
OperationPass<gpu::GPUModuleOp>> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
- TestXeGPUSgToWiDistributeExperimental)
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSgToLaneDistribute)
StringRef getArgument() const final {
- return "test-xegpu-sg-to-wi-distribute-experimental";
+ return "test-xegpu-sg-to-lane-distribute";
}
StringRef getDescription() const final {
- return "Test the experimental implementation of XeGPU Subgroup to "
- "Work-item Distribution";
+ return "Test the implementation of XeGPU Subgroup to Lane Distribution";
}
void getDependentDialects(::mlir::DialectRegistry &registry) const override {
@@ -287,9 +256,8 @@ struct TestXeGPUSgToWiDistributeExperimental
registry.insert<gpu::GPUDialect>();
}
- TestXeGPUSgToWiDistributeExperimental() = default;
- TestXeGPUSgToWiDistributeExperimental(
- const TestXeGPUSgToWiDistributeExperimental &pass)
+ TestXeGPUSgToLaneDistribute() = default;
+ TestXeGPUSgToLaneDistribute(const TestXeGPUSgToLaneDistribute &pass)
: PassWrapper(pass) {}
void runOnOperation() override {
@@ -313,42 +281,12 @@ struct TestXeGPUSgToWiDistributeExperimental
ConversionTarget target(*ctx);
RewritePatternSet patterns(ctx);
- xegpu::populateXeGPUSgToWiDistributeTypeConversionAndLegality(
+ xegpu::populateXeGPUSgToLaneDistributeTypeConversionAndLegality(
typeConverter, patterns, target);
(void)applyPartialConversion(op, target, std::move(patterns));
}
};
-struct TestXeGPUMoveFuncBodyToWarpOp
- : public PassWrapper<TestXeGPUMoveFuncBodyToWarpOp,
- OperationPass<gpu::GPUModuleOp>> {
- MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUMoveFuncBodyToWarpOp)
-
- StringRef getArgument() const final {
- return "test-xegpu-move-func-to-warp-op";
- }
-
- StringRef getDescription() const final {
- return "Test the implementation of XeGPU move gpu function body to "
- "WarpExecuteOnLane0 op.";
- }
-
- void getDependentDialects(::mlir::DialectRegistry &registry) const override {
- registry.insert<xegpu::XeGPUDialect>();
- registry.insert<gpu::GPUDialect>();
- }
-
- TestXeGPUMoveFuncBodyToWarpOp() = default;
- TestXeGPUMoveFuncBodyToWarpOp(const TestXeGPUMoveFuncBodyToWarpOp &pass) =
- default;
-
- void runOnOperation() override {
- RewritePatternSet patterns(&getContext());
- xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
- (void)applyPatternsGreedily(getOperation(), std::move(patterns));
- }
-};
-
struct TestXeGPUPropagateLayouts
: public PassWrapper<TestXeGPUPropagateLayouts,
OperationPass<gpu::GPUModuleOp>> {
@@ -516,9 +454,7 @@ void registerTestXeGPULowerings() {
PassRegistration<TestXeGPUUnrollingPatterns>();
PassRegistration<TestXeGPULayoutInterface>();
PassRegistration<TestXeGPURecoverTemporaryLayouts>();
- PassRegistration<TestXeGPUSGDistribute>();
- PassRegistration<TestXeGPUSgToWiDistributeExperimental>();
- PassRegistration<TestXeGPUMoveFuncBodyToWarpOp>();
+ PassRegistration<TestXeGPUSgToLaneDistribute>();
PassRegistration<TestXeGPUPropagateLayouts>();
PassRegistration<TestXeGPUResolveLayoutConflicts>();
PassRegistration<TestXeGPUArrayLengthOptimization>();
More information about the Mlir-commits
mailing list