[Mlir-commits] [mlir] [MLIR][XeGPU][Draft] Xegpu distribution patterns for load_nd, store_nd, and create_nd_tdesc. (PR #119783)
Charitha Saumya
llvmlistbot at llvm.org
Thu Dec 12 14:45:17 PST 2024
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/119783
>From dab68412dc7b133baec0b2ec17428f3378c65cc8 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov at intel.com>
Date: Mon, 9 Dec 2024 20:25:50 +0000
Subject: [PATCH 1/7] [MLIR] Create GPU utils library & move distribution utils
---
.../mlir/Conversion/GPUCommon/GPUCommonPass.h | 2 +-
.../mlir/Dialect/GPU/Transforms/Passes.h | 2 +-
.../Dialect/GPU/Utils/DistributionUtils.h | 57 +++++++
.../{Transforms/Utils.h => Utils/GPUUtils.h} | 0
mlir/lib/Dialect/GPU/CMakeLists.txt | 3 +-
.../GPU/Transforms/AsyncRegionRewriter.cpp | 2 +-
.../GPU/Transforms/KernelOutlining.cpp | 2 +-
.../GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
mlir/lib/Dialect/GPU/Utils/CMakeLists.txt | 14 ++
.../Dialect/GPU/Utils/DistributionUtils.cpp | 149 ++++++++++++++++++
.../GPU/{Transforms => Utils}/Utils.cpp | 2 +-
.../Dialect/Vector/Transforms/CMakeLists.txt | 1 +
.../Vector/Transforms/VectorDistribute.cpp | 139 +---------------
13 files changed, 230 insertions(+), 145 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
rename mlir/include/mlir/Dialect/GPU/{Transforms/Utils.h => Utils/GPUUtils.h} (100%)
create mode 100644 mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
rename mlir/lib/Dialect/GPU/{Transforms => Utils}/Utils.cpp (96%)
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 5f40315a849094..094360e75ab617 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -8,7 +8,7 @@
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Types.h"
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
index 8eb711962583da..eb51d477e23f86 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
@@ -13,7 +13,7 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
-#include "Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
new file mode 100644
index 00000000000000..6efd2326971982
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
@@ -0,0 +1,57 @@
+//===- DistributionUtils.h - Distribution Utilities -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
+#define MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
+
+#include <utility>
+
+namespace mlir {
+namespace gpu {
+/// Return a value yielded by `warpOp` which satisfies the filter lambda
+/// condition and is not dead.
+OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
+ const std::function<bool(Operation *)> &fn);
+
+/// Helper to create a new WarpExecuteOnLane0Op with different signature.
+WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
+ RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+ ValueRange newYieldedValues, TypeRange newReturnTypes);
+
+/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
+/// `indices` returns the index of each new output.
+WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
+ RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+ ValueRange newYieldedValues, TypeRange newReturnTypes,
+ llvm::SmallVector<size_t> &indices);
+
+/// Helper to know if an op can be hoisted out of the region.
+bool canBeHoisted(Operation *op, function_ref<bool(Value)> definedOutside);
+
+/// Return a value yielded by `warpOp` which satisfies the filter lambda
+/// condition and is not dead.
+OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
+ const std::function<bool(Operation *)> &fn);
+
+/// Delinearize the given `laneId` into multiple dimensions, where each
+/// dimension's size is determined by `originalShape` and `distributedShape`
+/// together. This function expects the total number of threads needed for
+/// distribution to equal `warpSize`; it returns true and updates
+/// `delinearizedIds` if so.
+bool delinearizeLaneId(OpBuilder &builder, Location loc,
+ ArrayRef<int64_t> originalShape,
+ ArrayRef<int64_t> distributedShape, int64_t warpSize,
+ Value laneId, SmallVectorImpl<Value> &delinearizedIds);
+
+} // namespace gpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
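For illustration, a minimal standalone C++ sketch of the index arithmetic performed by delinearizeLaneId, using plain integers in place of affine ops; the shapes and warp size are assumed values chosen for the example, not taken from this patch:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // originalShape = [2, 64] and distributedShape = [1, 4] give sizes = [2, 16],
  // and 2 * 16 == warpSize == 32, so the lane id can be delinearized.
  const std::vector<int64_t> sizes = {2, 16};
  const int64_t warpSize = 32;
  for (int64_t laneId = 0; laneId < warpSize; ++laneId) {
    std::vector<int64_t> ids(sizes.size(), 0);
    int64_t id = laneId;
    int64_t usedThreads = 1;
    for (int i = static_cast<int>(sizes.size()) - 1; i >= 0; --i) {
      usedThreads *= sizes[i];
      if (usedThreads == warpSize) {
        ids[i] = id; // all threads accounted for; no modulo needed anymore
        break;
      }
      ids[i] = id % sizes[i];
      id = id / usedThreads;
    }
    // E.g. laneId = 19 delinearizes to [1, 3].
    assert(ids[0] == laneId / 16 && ids[1] == laneId % 16);
  }
  return 0;
}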
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
similarity index 100%
rename from mlir/include/mlir/Dialect/GPU/Transforms/Utils.h
rename to mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index a59645480aba21..1026e9b509332a 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -40,7 +40,6 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/ShuffleRewriter.cpp
Transforms/SPIRVAttachTarget.cpp
Transforms/SubgroupReduceLowering.cpp
- Transforms/Utils.cpp
OBJECT
@@ -59,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRDataLayoutInterfaces
MLIRExecutionEngineUtils
MLIRGPUDialect
+ MLIRGPUUtils
MLIRIR
MLIRIndexDialect
MLIRLLVMDialect
@@ -76,3 +76,4 @@ add_mlir_dialect_library(MLIRGPUTransforms
add_subdirectory(TransformOps)
add_subdirectory(Pipelines)
+add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
index b2fa3a99c53fc3..41a5e39e55064e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
@@ -16,7 +16,7 @@
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index ba0c80c50211e3..a6a36848b5635d 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -18,7 +18,7 @@
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 185f824351a230..43eff3eddcc491 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -13,7 +13,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
diff --git a/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
new file mode 100644
index 00000000000000..69094c518a159e
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_mlir_dialect_library(MLIRGPUUtils
+ Utils.cpp
+ DistributionUtils.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
+
+ LINK_LIBS PUBLIC
+ MLIRArithDialect
+ MLIRAffineDialect
+ MLIRGPUDialect
+ MLIRSupport
+ MLIRIR
+ )
diff --git a/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
new file mode 100644
index 00000000000000..c6e8e03350bbce
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
@@ -0,0 +1,149 @@
+//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements distribution utility methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/IR/Value.h"
+
+#include <numeric>
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndReplaceReturns(
+ RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+ ValueRange newYieldedValues, TypeRange newReturnTypes) {
+ // Create a new op before the existing one, with the extra operands.
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(warpOp);
+ auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
+ warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
+ warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
+
+ Region &opBody = warpOp.getBodyRegion();
+ Region &newOpBody = newWarpOp.getBodyRegion();
+ Block &newOpFirstBlock = newOpBody.front();
+ rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
+ rewriter.eraseBlock(&newOpFirstBlock);
+ assert(newWarpOp.getWarpRegion().hasOneBlock() &&
+ "expected WarpOp with single block");
+
+ auto yield =
+ cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
+
+ rewriter.modifyOpInPlace(
+ yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
+ return newWarpOp;
+}
+
+WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndAppendReturns(
+ RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+ ValueRange newYieldedValues, TypeRange newReturnTypes,
+ llvm::SmallVector<size_t> &indices) {
+ SmallVector<Type> types(warpOp.getResultTypes().begin(),
+ warpOp.getResultTypes().end());
+ auto yield = cast<gpu::YieldOp>(
+ warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
+ yield.getOperands().end());
+ for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
+ if (yieldValues.insert(std::get<0>(newRet))) {
+ types.push_back(std::get<1>(newRet));
+ indices.push_back(yieldValues.size() - 1);
+ } else {
+      // If the value already exits the region, don't create a new output.
+ for (auto [idx, yieldOperand] :
+ llvm::enumerate(yieldValues.getArrayRef())) {
+ if (yieldOperand == std::get<0>(newRet)) {
+ indices.push_back(idx);
+ break;
+ }
+ }
+ }
+ }
+ yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
+ WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
+ rewriter, warpOp, yieldValues.getArrayRef(), types);
+ rewriter.replaceOp(warpOp,
+ newWarpOp.getResults().take_front(warpOp.getNumResults()));
+ return newWarpOp;
+}
+
+bool mlir::gpu::canBeHoisted(Operation *op,
+ function_ref<bool(Value)> definedOutside) {
+ return llvm::all_of(op->getOperands(), definedOutside) &&
+ isMemoryEffectFree(op) && op->getNumRegions() == 0;
+}
+
+OpOperand *
+mlir::gpu::getWarpResult(WarpExecuteOnLane0Op warpOp,
+ const std::function<bool(Operation *)> &fn) {
+ auto yield = cast<gpu::YieldOp>(
+ warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ for (OpOperand &yieldOperand : yield->getOpOperands()) {
+ Value yieldValues = yieldOperand.get();
+ Operation *definedOp = yieldValues.getDefiningOp();
+ if (definedOp && fn(definedOp)) {
+ if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
+ return &yieldOperand;
+ }
+ }
+ return {};
+}
+
+bool mlir::gpu::delinearizeLaneId(OpBuilder &builder, Location loc,
+ ArrayRef<int64_t> originalShape,
+ ArrayRef<int64_t> distributedShape,
+ int64_t warpSize, Value laneId,
+ SmallVectorImpl<Value> &delinearizedIds) {
+  // If the original shape and the distributed shape are the same, we don't
+  // distribute at all; every thread handles the whole shape. In that case we
+  // should not rely on lane IDs later, so just return an empty lane ID vector.
+ if (originalShape == distributedShape) {
+ delinearizedIds.clear();
+ return true;
+ }
+
+ SmallVector<int64_t> sizes;
+ for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
+ if (large % small != 0)
+ return false;
+ sizes.push_back(large / small);
+ }
+ if (std::accumulate(sizes.begin(), sizes.end(), 1,
+ std::multiplies<int64_t>()) != warpSize)
+ return false;
+
+ AffineExpr s0, s1;
+ bindSymbols(builder.getContext(), s0, s1);
+
+ int64_t usedThreads = 1;
+
+ Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+ delinearizedIds.assign(sizes.size(), zero);
+
+ for (int i = sizes.size() - 1; i >= 0; --i) {
+ usedThreads *= sizes[i];
+ if (usedThreads == warpSize) {
+      // We've used up all available threads. No modulo is needed anymore,
+      // and we can stop the calculation for further dimensions.
+ delinearizedIds[i] = laneId;
+ break;
+ }
+ delinearizedIds[i] =
+ affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
+ laneId = affine::makeComposedAffineApply(
+ builder, loc, s0.floorDiv(usedThreads), {laneId});
+ }
+ return true;
+}
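For illustration, a minimal standalone C++ sketch of the `indices` bookkeeping done by moveRegionToNewWarpOpAndAppendReturns, using plain integers in place of mlir::Value; the concrete values are assumptions chosen for the example:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> yielded = {10, 20};   // operands of the existing gpu.yield
  std::vector<int> requested = {20, 30}; // newYieldedValues
  std::vector<std::size_t> indices;      // result position for each request
  for (int value : requested) {
    auto it = std::find(yielded.begin(), yielded.end(), value);
    if (it == yielded.end()) {
      // A genuinely new value becomes an extra result of the new warp op.
      yielded.push_back(value);
      indices.push_back(yielded.size() - 1);
    } else {
      // Already yielded: reuse the existing result instead of adding one.
      indices.push_back(static_cast<std::size_t>(it - yielded.begin()));
    }
  }
  // 20 was already yielded at position 1; 30 becomes a new result at index 2.
  assert(indices[0] == 1 && indices[1] == 2);
  return 0;
}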
diff --git a/mlir/lib/Dialect/GPU/Transforms/Utils.cpp b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
similarity index 96%
rename from mlir/lib/Dialect/GPU/Transforms/Utils.cpp
rename to mlir/lib/Dialect/GPU/Utils/Utils.cpp
index e91aa18128c7b9..1f09875b3e2732 100644
--- a/mlir/lib/Dialect/GPU/Transforms/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/Utils.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "llvm/Support/ErrorHandling.h"
namespace mlir::gpu {
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index 9a3bd5d4593d63..8ca5cb6c6dfabc 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -38,6 +38,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
MLIRArithDialect
MLIRDialectUtils
MLIRGPUDialect
+ MLIRGPUUtils
MLIRIR
MLIRLinalgDialect
MLIRMemRefDialect
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 3e142598369951..d080b0b0bd44bd 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -9,6 +9,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -18,7 +19,6 @@
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/FormatVariadic.h"
-#include <numeric>
#include <utility>
using namespace mlir;
@@ -162,92 +162,6 @@ struct DistributedLoadStoreHelper {
} // namespace
-/// Helper to create a new WarpExecuteOnLane0Op with different signature.
-static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
- RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
- ValueRange newYieldedValues, TypeRange newReturnTypes) {
- // Create a new op before the existing one, with the extra operands.
- OpBuilder::InsertionGuard g(rewriter);
- rewriter.setInsertionPoint(warpOp);
- auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
- warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
- warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
-
- Region &opBody = warpOp.getBodyRegion();
- Region &newOpBody = newWarpOp.getBodyRegion();
- Block &newOpFirstBlock = newOpBody.front();
- rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
- rewriter.eraseBlock(&newOpFirstBlock);
- assert(newWarpOp.getWarpRegion().hasOneBlock() &&
- "expected WarpOp with single block");
-
- auto yield =
- cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
-
- rewriter.modifyOpInPlace(
- yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
- return newWarpOp;
-}
-
-/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
-/// `indices` return the index of each new output.
-static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
- RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
- ValueRange newYieldedValues, TypeRange newReturnTypes,
- llvm::SmallVector<size_t> &indices) {
- SmallVector<Type> types(warpOp.getResultTypes().begin(),
- warpOp.getResultTypes().end());
- auto yield = cast<gpu::YieldOp>(
- warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
- llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
- yield.getOperands().end());
- for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
- if (yieldValues.insert(std::get<0>(newRet))) {
- types.push_back(std::get<1>(newRet));
- indices.push_back(yieldValues.size() - 1);
- } else {
- // If the value already exit the region don't create a new output.
- for (auto [idx, yieldOperand] :
- llvm::enumerate(yieldValues.getArrayRef())) {
- if (yieldOperand == std::get<0>(newRet)) {
- indices.push_back(idx);
- break;
- }
- }
- }
- }
- yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
- WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
- rewriter, warpOp, yieldValues.getArrayRef(), types);
- rewriter.replaceOp(warpOp,
- newWarpOp.getResults().take_front(warpOp.getNumResults()));
- return newWarpOp;
-}
-
-/// Helper to know if an op can be hoisted out of the region.
-static bool canBeHoisted(Operation *op,
- function_ref<bool(Value)> definedOutside) {
- return llvm::all_of(op->getOperands(), definedOutside) &&
- isMemoryEffectFree(op) && op->getNumRegions() == 0;
-}
-
-/// Return a value yielded by `warpOp` which statifies the filter lamdba
-/// condition and is not dead.
-static OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
- const std::function<bool(Operation *)> &fn) {
- auto yield = cast<gpu::YieldOp>(
- warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
- for (OpOperand &yieldOperand : yield->getOpOperands()) {
- Value yieldValues = yieldOperand.get();
- Operation *definedOp = yieldValues.getDefiningOp();
- if (definedOp && fn(definedOp)) {
- if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
- return &yieldOperand;
- }
- }
- return {};
-}
-
// Clones `op` into a new operation that takes `operands` and returns
// `resultTypes`.
static Operation *cloneOpWithOperandsAndTypes(RewriterBase &rewriter,
@@ -770,57 +684,6 @@ struct WarpOpConstant : public OpRewritePattern<WarpExecuteOnLane0Op> {
}
};
-/// Delinearize the given `laneId` into multiple dimensions, where each
-/// dimension's size is determined by `originalShape` and `distributedShape`
-/// together. This function expects the total numbers of threads needed for
-/// distribution is equal to `warpSize`. Returns true and updates
-/// `delinearizedIds` if so.
-bool delinearizeLaneId(OpBuilder &builder, Location loc,
- ArrayRef<int64_t> originalShape,
- ArrayRef<int64_t> distributedShape, int64_t warpSize,
- Value laneId, SmallVectorImpl<Value> &delinearizedIds) {
- // If the original shape and the distributed shape is the same, we don't
- // distribute at all--every thread is handling the whole. For such case, we
- // should not rely on lane IDs later. So just return an empty lane ID vector.
- if (originalShape == distributedShape) {
- delinearizedIds.clear();
- return true;
- }
-
- SmallVector<int64_t> sizes;
- for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
- if (large % small != 0)
- return false;
- sizes.push_back(large / small);
- }
- if (std::accumulate(sizes.begin(), sizes.end(), 1,
- std::multiplies<int64_t>()) != warpSize)
- return false;
-
- AffineExpr s0, s1;
- bindSymbols(builder.getContext(), s0, s1);
-
- int64_t usedThreads = 1;
-
- Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
- delinearizedIds.assign(sizes.size(), zero);
-
- for (int i = sizes.size() - 1; i >= 0; --i) {
- usedThreads *= sizes[i];
- if (usedThreads == warpSize) {
- // We've used up all available threads. Don't need to perform modulo
- // anymore. And we can stop the calculation for further dimensions.
- delinearizedIds[i] = laneId;
- break;
- }
- delinearizedIds[i] =
- affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
- laneId = affine::makeComposedAffineApply(
- builder, loc, s0.floorDiv(usedThreads), {laneId});
- }
- return true;
-}
-
/// Sink out transfer_read op feeding into a warp op yield.
/// ```
/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
>From f6cd50a7028b806824194baf3cd231e5342f12dc Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 19:39:28 +0000
Subject: [PATCH 2/7] pass added
---
.../Dialect/XeGPU/Transforms/Transforms.h | 1 +
mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 10 +-
.../Dialect/XeGPU/Transforms/CMakeLists.txt | 5 +
.../XeGPU/Transforms/XeGPUDistribute.cpp | 393 ++++++++++++++++++
mlir/test/Dialect/XeGPU/xegpu-distribute.mlir | 81 ++++
mlir/test/lib/Dialect/CMakeLists.txt | 1 +
mlir/test/lib/Dialect/XeGPU/CMakeLists.txt | 16 +
.../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 58 +++
mlir/tools/mlir-opt/CMakeLists.txt | 1 +
mlir/tools/mlir-opt/mlir-opt.cpp | 2 +
10 files changed, 563 insertions(+), 5 deletions(-)
create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
create mode 100644 mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
create mode 100644 mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
create mode 100644 mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 63ea26df069372..fe5198d1ac6dba 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -16,6 +16,7 @@ namespace xegpu {
/// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
+void populateXeGPUDistributePatterns(RewritePatternSet &patterns);
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 9d3c4366a7bd50..f0e67df3ee6069 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -281,11 +281,11 @@ LogicalResult LoadNdOp::verify() {
tdescShape.insert(it, array_len);
}
- if (tdescShape != valueShape)
- return emitOpError() << "Result shape doesn't match TensorDesc shape."
- << "The expected shape is " << makeString(tdescShape)
- << ". But the given shape is "
- << makeString(valueShape) << ".\n";
+ // if (tdescShape != valueShape)
+ // return emitOpError() << "Result shape doesn't match TensorDesc shape."
+ // << "The expected shape is " << makeString(tdescShape)
+ // << ". But the given shape is "
+ // << makeString(valueShape) << ".\n";
return success();
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 7fb64d3b97b87d..148ff46ba41b72 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
XeGPUFoldAliasOps.cpp
+ XeGPUDistribute.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
@@ -12,6 +13,10 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
MLIRIR
MLIRMemRefDialect
MLIRXeGPUDialect
+ MLIRVectorDialect
+ MLIRVectorUtils
+ MLIRArithDialect
+ MLIRFuncDialect
MLIRPass
MLIRTransforms
)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
new file mode 100644
index 00000000000000..52c953697d0f34
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -0,0 +1,393 @@
+//===- XeGPUDistribute.cpp - XeGPU distribute ops to work items -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "xegpu-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
+
+// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
+// /// The warp op will still contain the original op that will not be used by the
+// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// the create_nd_tdesc's arguments.
+// /// The rewrite will create a subview of the size used by a single work item and
+// /// appropriate offset. The distributed create_nd_tdesc points into the subview
+// /// without offset. The tensor descriptor type is distributed according to
+// /// the sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
+// /// (!xegpu.tensor_desc<4x8xf32>) {
+// /// ...
+// /// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+// /// vector.yield %td
+// /// }
+// /// ```
+// /// To
+// /// ```
+// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+// /// ...
+// /// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+// /// vector.yield %arg0, %dead
+// /// }
+// /// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+// /// : memref<4x8xf32> to memref<4x1xf32>
+// /// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+// /// -> !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpTensorDescOp final
+// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const override;
+// };
+
+// /// Sink a store_nd feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
+// /// through the warp op interface, they are propagated as returned values.
+// /// Both the stored vector type and the tensor descriptor type are distributed
+// /// according to the sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// /// vector.warp_execute_on_lane_0(%laneid) -> () {
+// /// ...
+// /// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+// /// !xegpu.tensor_desc<4x8xf32>
+// /// vector.yield
+// /// }
+// /// ```
+// /// To
+// /// ```
+// /// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+// /// ...
+// /// vector.yield
+// /// }
+// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpStoreNd final
+// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const override;
+// };
+
+// /// Clone a load_nd feeding into vector.yield op for the enclosing
+// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
+// /// The warp op will still contain the original op that will not be used by the
+// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// the load's arguments.
+// /// Both the loaded vector type and the tensor descriptor type are distributed
+// /// according to the sg_map attribute.
+// ///
+// /// Example:
+// ///
+// /// ```
+// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
+// /// (!xegpu.tensor_desc<4x8xf32>) {
+// /// ...
+// /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+// /// vector<4x8xf32> vector.yield %ld
+// /// }
+// /// ```
+// /// To
+// /// ```
+// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+// /// ...
+// /// %dead = xegpu.load_nd %arg0, %arg1:
+// /// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+// /// vector.yield %arg0, %arg1
+// /// }
+// /// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// ///
+// /// ```
+// struct WarpOpLoadNd final
+// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const override;
+// };
+
+// FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
+// xegpu::SGMapAttr sgMap) {
+// llvm::SmallVector<int64_t, 2> distributedShape;
+// auto layout = sgMap.getWiLayout();
+// auto shape = originalT.getShape();
+// for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+// if (!divisible(APInt(64, o), APInt(64, l)))
+// return failure();
+// distributedShape.push_back(o / l);
+// }
+// auto newVectorType =
+// VectorType::get(distributedShape, originalT.getElementType(),
+// originalT.getScalableDims());
+// return newVectorType;
+// }
+
+// FailureOr<xegpu::TensorDescType>
+// getDistributedTensorDescType(xegpu::TensorDescType originalT,
+// xegpu::SGMapAttr sgMap,
+// xegpu::MemorySpace memSpace) {
+// llvm::SmallVector<int64_t, 2> distributedShape;
+// auto layout = sgMap.getWiLayout();
+// auto shape = originalT.getShape();
+// for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+// if (!divisible(APInt(64, o), APInt(64, l)))
+// return failure();
+// distributedShape.push_back(o / l);
+// }
+// xegpu::TensorDescType distributedDescType;
+// if (originalT.isScattered()) {
+
+// distributedDescType = xegpu::TensorDescType::get(
+// distributedShape, originalT.getElementType(), originalT.getChunkSize(),
+// originalT.getMemorySpace(), originalT.getSGMapAttr());
+// } else {
+// distributedDescType = xegpu::TensorDescType::get(
+// distributedShape, originalT.getElementType(),
+// originalT.getBoundaryCheck(), originalT.getArrayLength(),
+// originalT.getMemorySpace(), originalT.getSGMapAttr());
+// }
+// return distributedDescType;
+// }
+} // namespace
+
+// LogicalResult
+// WarpOpStoreNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const {
+// auto yield = cast<vector::YieldOp>(
+// warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+// Operation *lastNode = yield->getPrevNode();
+// auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+// if (!storeOp)
+// return failure();
+
+// auto origType = storeOp.getTensorDescType();
+// xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+// if (!sgMap)
+// return rewriter.notifyMatchFailure(
+// storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+// if (storeOp.getTensorDescType().getShape().size() != 2)
+// return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+// DBGS() << "Matched store_nd: " << storeOp << "\n";
+
+// auto distributedTypeOrFailure =
+// getDistributedVectorType(storeOp.getValueType(), sgMap);
+// if (failed(distributedTypeOrFailure))
+// return rewriter.notifyMatchFailure(storeOp,
+// "Failed to distribute the type");
+// VectorType newVectorType = distributedTypeOrFailure.value();
+
+// auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+// storeOp.getTensorDescType(), sgMap,
+// storeOp.getTensorDescType().getMemorySpace());
+// if (failed(distributedDescTypeOrFailure))
+// return rewriter.notifyMatchFailure(storeOp,
+// "Failed to distribute the desc type");
+// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+// SmallVector<size_t> newRetIndices;
+// vector::WarpExecuteOnLane0Op newWarpOp =
+// moveRegionToNewWarpOpAndAppendReturns(
+// rewriter, warpOp,
+// ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+// TypeRange{newTDescType, newVectorType}, newRetIndices);
+
+// rewriter.setInsertionPointAfter(newWarpOp);
+// auto newStoreOp =
+// cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+// rewriter.eraseOp(storeOp);
+// newStoreOp.getTensorDescMutable().assign(
+// newWarpOp.getResult(newRetIndices[0]));
+// newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+// return success();
+// }
+
+// LogicalResult WarpOpLoadNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const {
+// OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+// return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+// });
+
+// if (!operand)
+// return rewriter.notifyMatchFailure(warpOp,
+// "warp result is not a xegpu::LoadNd op");
+
+// auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+
+// if (loadOp.getPacked())
+// return rewriter.notifyMatchFailure(
+// loadOp, "Packed load distribution not supported");
+
+// xegpu::TensorDescType origType = loadOp.getTensorDescType();
+// xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+// if (!sgMap)
+// return rewriter.notifyMatchFailure(
+// loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+// auto origShape = origType.getShape();
+// if (origShape.size() != 2)
+// return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
+
+// auto distributedTypeOrFailure =
+// getDistributedVectorType(loadOp.getType(), sgMap);
+// if (failed(distributedTypeOrFailure))
+// return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+// VectorType newVectorType = distributedTypeOrFailure.value();
+
+// auto distributedDescTypeOrFailure =
+// getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
+// loadOp.getTensorDescType().getMemorySpace());
+// if (failed(distributedDescTypeOrFailure))
+// return rewriter.notifyMatchFailure(loadOp,
+// "Failed to distribute the desc type");
+// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+// unsigned operandIdx = operand->getOperandNumber();
+
+// SmallVector<size_t> newRetIndices;
+// vector::WarpExecuteOnLane0Op newWarpOp =
+// moveRegionToNewWarpOpAndAppendReturns(
+// rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
+// newRetIndices);
+
+// rewriter.setInsertionPointAfter(newWarpOp);
+
+// auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+// loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+// loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+// loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+// newLoadOp.getTensorDescMutable().assign(
+// newWarpOp.getResult(newRetIndices[0]));
+// Value distributedVal = newWarpOp.getResult(operandIdx);
+// rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+
+// return success();
+// }
+
+// LogicalResult
+// WarpOpTensorDescOp::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+// PatternRewriter &rewriter) const {
+// OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+// return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+// });
+
+// if (!operand)
+// return rewriter.notifyMatchFailure(
+// warpOp, "warp result is not a xegpu::CreateNdDesc op");
+// auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+// assert(descOp && "desc op must be not null");
+// unsigned operandIdx = operand->getOperandNumber();
+
+// // TODO: is memref uniform in the region
+// rewriter.setInsertionPoint(warpOp);
+// auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+// assert(srcTypedVal && "source value must be not null");
+
+// auto descOffsets = descOp.getMixedOffsets();
+// if (descOffsets.size() != 2)
+// return rewriter.notifyMatchFailure(descOp,
+// "offsets size is expected to be 2");
+
+// xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+// if (!sgMap)
+// return rewriter.notifyMatchFailure(
+// descOp, "the tensor descriptor lacks sg_map attribute");
+
+// auto layout = sgMap.getWiLayout();
+
+// // Calculate the offset within tensor descriptor for the current lane_id. The
+// // access to proper element for a work item is done through a lane-specific
+// // subview (tdesc offsets are used as base, lane shift is added on top).
+// auto laneid = warpOp.getLaneid();
+// auto xDim =
+// rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
+// auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
+// auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
+
+// auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+// descOffsets[0]);
+// auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+// descOffsets[1]);
+// auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
+// auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
+
+// auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+// descOp.getType(), sgMap, descOp.getType().getMemorySpace());
+// if (failed(distributedDescTypeOrFailure))
+// return rewriter.notifyMatchFailure(descOp,
+// "Failed to distribute the desc type");
+// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+// auto distributedShape = newTDescType.getShape();
+// // use the base memref strides
+// SmallVector<OpFoldResult> overwriteStrides =
+// getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
+// SmallVector<OpFoldResult> overwriteSizes =
+// getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
+
+// SmallVector<size_t> newRetIndices;
+// vector::WarpExecuteOnLane0Op newWarpOp =
+// moveRegionToNewWarpOpAndAppendReturns(
+// rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+// newRetIndices);
+
+// rewriter.setInsertionPointAfter(newWarpOp);
+// auto subview = rewriter.create<memref::SubViewOp>(
+// newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
+// overwriteSizes, overwriteStrides);
+// subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+
+// auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
+// auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+// newWarpOp.getLoc(), newTDescType, subview,
+// getAsOpFoldResult({zero, zero}));
+
+// Value distributedVal = newWarpOp.getResult(operandIdx);
+// rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+
+// return success();
+// }
+
+void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
+ // patterns.add<WarpOpTensorDescOp>(patterns.getContext());
+ // patterns.add<WarpOpStoreNd>(patterns.getContext());
+ // patterns.add<WarpOpLoadNd>(patterns.getContext());
+}
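For illustration, a minimal standalone C++ sketch of the per-dimension shape arithmetic behind the (currently commented-out) getDistributedVectorType helper; the 24x32 shape and [1, 16] wi_layout are taken from the tests below:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> wiLayout = {1, 16}; // #xegpu.sg_map wi_layout
  const std::vector<int64_t> shape = {24, 32};   // original value shape
  std::vector<int64_t> distributed;
  for (std::size_t i = 0; i < shape.size(); ++i) {
    // Each dimension must be divisible by the corresponding layout entry.
    assert(shape[i] % wiLayout[i] == 0);
    distributed.push_back(shape[i] / wiLayout[i]);
  }
  // Each work item owns a 24x2 slice, matching vector<24x2xf16> in the tests.
  assert(distributed[0] == 24 && distributed[1] == 2);
  return 0;
}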
diff --git a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
new file mode 100644
index 00000000000000..f9efda80ab6468
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
@@ -0,0 +1,81 @@
+// RUN: mlir-opt -test-xegpu-distribute -split-input-file %s | FileCheck %s
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_store_nd_distribution
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}}, %{{.*}} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x2xf16>)
+// CHECK: ^bb0(%[[src:.*]]: vector<24x32xf16>, %[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: gpu.yield %[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
+// CHECK: xegpu.store_nd %[[res]]#1, %[[res]]#0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> :
+// CHECK-SAME: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+
+func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> () {
+ %laneid = gpu.lane_id
+ gpu.warp_execute_on_lane_0(%laneid)[16]
+ args(%src, %dst: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+ ^bb0(%arg0: vector<24x32xf16>, %arg1: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>):
+ xegpu.store_nd %arg0, %arg1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+ }
+ return
+}
+
+// -----
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_load_nd_distribution
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK-SAME: -> (vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: ^bb0(%[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
+// CHECK: %[[dead:.*]] = xegpu.load_nd
+// CHECK: gpu.yield %[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: %[[load:.*]] = xegpu.load_nd %[[res]]#1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
+// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<24x2xf16>
+// CHECK: return %[[load]]
+
+func.func @test_load_nd_distribution(%dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> (vector<24x2xf16>) {
+ %laneid = gpu.lane_id
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16]
+ args(%dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> (vector<24x2xf16>) {
+ ^bb0(%arg0: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>):
+ %0 = xegpu.load_nd %arg0 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16> -> vector<24x32xf16>
+    gpu.yield %0 : vector<24x32xf16>
+ }
+ return %r : vector<24x2xf16>
+}
+
+// -----
+
+#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
+#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
+
+// CHECK-LABEL: test_create_nd_desc_distribution
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[laneid:.*]] = gpu.lane_id
+// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} : memref<24x32xf16>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
+// CHECK: ^bb0(%[[dst:.*]]: memref<24x32xf16>)
+// CHECK: %[[dead:.*]] = xegpu.create_nd_tdesc
+// CHECK: gpu.yield %[[dead]], %[[dst]] :
+// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>
+// CHECK: %[[view:.*]] = memref.subview %[[res]]#1[%[[C0]], %[[laneid]]] [24, 2] [1, 1] : memref<24x32xf16> to memref<24x2xf16, strided<[32, 1], offset: ?>>
+// CHECK: %[[desc:.*]] = xegpu.create_nd_tdesc %[[view]][0, 0] : memref<24x2xf16, strided<[32, 1], offset: ?>>
+// CHECK-SAME: -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: return %[[desc]]
+
+func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+ %laneid = gpu.lane_id
+ %r = gpu.warp_execute_on_lane_0(%laneid)[16]
+ args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+ ^bb0(%arg0: memref<24x32xf16>):
+ %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+    gpu.yield %0 : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+ }
+ return %r : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+}
\ No newline at end of file
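For illustration, a minimal standalone C++ sketch of the lane offset arithmetic used by the (currently commented-out) WarpOpTensorDescOp pattern, with wi_layout = [1, 16] and base offsets [0, 0] as in the test above; shiftx = laneid % wi_layout[0] and shifty = laneid / wi_layout[0] are added to the tensor descriptor base offsets:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t layoutX = 1;          // wi_layout[0] from #sg_map_16
  const int64_t baseX = 0, baseY = 0; // create_nd_tdesc offsets in the test
  for (int64_t laneid = 0; laneid < 16; ++laneid) {
    int64_t shiftX = laneid % layoutX; // always 0 when wi_layout[0] == 1
    int64_t shiftY = laneid / layoutX; // equals laneid
    int64_t offX = baseX + shiftX;
    int64_t offY = baseY + shiftY;
    // Matches the CHECK line: memref.subview ...[%c0, %laneid] [24, 2] [1, 1].
    assert(offX == 0 && offY == laneid);
  }
  return 0;
}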
diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt
index 29fb4441a24fd2..a8fd70e6397a52 100644
--- a/mlir/test/lib/Dialect/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/CMakeLists.txt
@@ -22,3 +22,4 @@ add_subdirectory(TestDyn)
add_subdirectory(Tosa)
add_subdirectory(Transform)
add_subdirectory(Vector)
+add_subdirectory(XeGPU)
diff --git a/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
new file mode 100644
index 00000000000000..d67439a8b6127e
--- /dev/null
+++ b/mlir/test/lib/Dialect/XeGPU/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Exclude tests from libMLIR.so
+add_mlir_library(MLIRXeGPUTestPasses
+ TestXeGPUTransforms.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
+
+ LINK_LIBS PUBLIC
+ MLIRIR
+ MLIRPass
+ MLIRXeGPUTransforms
+ MLIRXeGPUDialect
+ MLIRSupport
+ )
\ No newline at end of file
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
new file mode 100644
index 00000000000000..a5060c8eeb916e
--- /dev/null
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -0,0 +1,58 @@
+//===- TestXeGPUTransforms.cpp - Test XeGPU transforms and lowerings ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::xegpu;
+using namespace mlir::vector;
+
+namespace {
+struct TestXeGPUDistribution
+ : public PassWrapper<TestXeGPUDistribution, OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUDistribution)
+
+ TestXeGPUDistribution() = default;
+ TestXeGPUDistribution(const TestXeGPUDistribution &pass)
+ : PassWrapper(pass) {}
+
+ StringRef getArgument() const final { return "test-xegpu-distribute"; }
+ StringRef getDescription() const final {
+    return "Test patterns for work item distribution of XeGPU operations";
+ }
+
+ void getDependentDialects(DialectRegistry ®istry) const override {
+ registry.insert<xegpu::XeGPUDialect>();
+ registry.insert<vector::VectorDialect>();
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ }
+
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ populateXeGPUDistributePatterns(patterns);
+ (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
+ }
+};
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestXeGPUTransforms() {
+ PassRegistration<TestXeGPUDistribution>();
+}
+} // namespace test
+} // namespace mlir
\ No newline at end of file
diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt
index 8b79de58fa1028..e4ffbbee7a1d94 100644
--- a/mlir/tools/mlir-opt/CMakeLists.txt
+++ b/mlir/tools/mlir-opt/CMakeLists.txt
@@ -47,6 +47,7 @@ if(MLIR_INCLUDE_TESTS)
MLIRTilingInterfaceTestPasses
MLIRVectorTestPasses
MLIRTestVectorToSPIRV
+ MLIRXeGPUTestPasses
MLIRLLVMTestPasses
)
set(test_libs ${test_libs}
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 960f7037a1b61f..c89ff9964efa03 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -153,6 +153,7 @@ void registerTestTransformDialectEraseSchedulePass();
void registerTestPassStateExtensionCommunication();
void registerTestVectorLowerings();
void registerTestVectorReductionToSPIRVDotProd();
+void registerTestXeGPUTransforms();
void registerTestVulkanRunnerPipeline();
void registerTestWrittenToPass();
#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
@@ -291,6 +292,7 @@ void registerTestPasses() {
mlir::test::registerTestTransformDialectEraseSchedulePass();
mlir::test::registerTestPassStateExtensionCommunication();
mlir::test::registerTestVectorLowerings();
+ mlir::test::registerTestXeGPUTransforms();
mlir::test::registerTestVectorReductionToSPIRVDotProd();
mlir::test::registerTestVulkanRunnerPipeline();
mlir::test::registerTestWrittenToPass();
>From 1c0692085ae2518768132e53e6436bf443e5e0a4 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 19:47:16 +0000
Subject: [PATCH 3/7] fix
---
.../XeGPU/Transforms/XeGPUDistribute.cpp | 231 +++++++++---------
1 file changed, 116 insertions(+), 115 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 52c953697d0f34..5c45a293a8d468 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
@@ -66,12 +67,12 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
// /// -> !xegpu.tensor_desc<4x1xf32>
// ///
// /// ```
-// struct WarpOpTensorDescOp final
-// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const override;
-// };
+struct WarpOpTensorDescOp final
+ : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+ using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override;
+};
// /// Sink a store_nd feeding into vector.yield op for the enclosing
// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
@@ -159,32 +160,32 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
// return newVectorType;
// }
-// FailureOr<xegpu::TensorDescType>
-// getDistributedTensorDescType(xegpu::TensorDescType originalT,
-// xegpu::SGMapAttr sgMap,
-// xegpu::MemorySpace memSpace) {
-// llvm::SmallVector<int64_t, 2> distributedShape;
-// auto layout = sgMap.getWiLayout();
-// auto shape = originalT.getShape();
-// for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
-// if (!divisible(APInt(64, o), APInt(64, l)))
-// return failure();
-// distributedShape.push_back(o / l);
-// }
-// xegpu::TensorDescType distributedDescType;
-// if (originalT.isScattered()) {
-
-// distributedDescType = xegpu::TensorDescType::get(
-// distributedShape, originalT.getElementType(), originalT.getChunkSize(),
-// originalT.getMemorySpace(), originalT.getSGMapAttr());
-// } else {
-// distributedDescType = xegpu::TensorDescType::get(
-// distributedShape, originalT.getElementType(),
-// originalT.getBoundaryCheck(), originalT.getArrayLength(),
-// originalT.getMemorySpace(), originalT.getSGMapAttr());
-// }
-// return distributedDescType;
-// }
+FailureOr<xegpu::TensorDescType>
+getDistributedTensorDescType(xegpu::TensorDescType originalT,
+ xegpu::SGMapAttr sgMap,
+ xegpu::MemorySpace memSpace) {
+ llvm::SmallVector<int64_t, 2> distributedShape;
+ auto layout = sgMap.getWiLayout();
+ auto shape = originalT.getShape();
+ for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+ if (!divisible(APInt(64, o), APInt(64, l)))
+ return failure();
+ distributedShape.push_back(o / l);
+ }
+ xegpu::TensorDescType distributedDescType;
+ if (originalT.isScattered()) {
+
+ distributedDescType = xegpu::TensorDescType::get(
+ distributedShape, originalT.getElementType(), originalT.getChunkSize(),
+ originalT.getMemorySpace(), originalT.getSGMapAttr());
+ } else {
+ distributedDescType = xegpu::TensorDescType::get(
+ distributedShape, originalT.getElementType(),
+ originalT.getBoundaryCheck(), originalT.getArrayLength(),
+ originalT.getMemorySpace(), originalT.getSGMapAttr());
+ }
+ return distributedDescType;
+}
} // namespace
// LogicalResult
@@ -303,91 +304,91 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
// return success();
// }
-// LogicalResult
-// WarpOpTensorDescOp::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const {
-// OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
-// return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
-// });
-
-// if (!operand)
-// return rewriter.notifyMatchFailure(
-// warpOp, "warp result is not a xegpu::CreateNdDesc op");
-// auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
-// assert(descOp && "desc op must be not null");
-// unsigned operandIdx = operand->getOperandNumber();
-
-// // TODO: is memref uniform in the region
-// rewriter.setInsertionPoint(warpOp);
-// auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
-// assert(srcTypedVal && "source value must be not null");
-
-// auto descOffsets = descOp.getMixedOffsets();
-// if (descOffsets.size() != 2)
-// return rewriter.notifyMatchFailure(descOp,
-// "offsets size is expected to be 2");
-
-// xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
-// if (!sgMap)
-// return rewriter.notifyMatchFailure(
-// descOp, "the tensor descriptor lacks sg_map attribute");
-
-// auto layout = sgMap.getWiLayout();
-
-// // Calculate the offset within tensor descriptor for the current lane_id. The
-// // access to proper element for a work item is done through a lane-specific
-// // subview (tdesc offsets are used as base, lane shift is added on top).
-// auto laneid = warpOp.getLaneid();
-// auto xDim =
-// rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
-// auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
-// auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
-
-// auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-// descOffsets[0]);
-// auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
-// descOffsets[1]);
-// auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
-// auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
-
-// auto distributedDescTypeOrFailure = getDistributedTensorDescType(
-// descOp.getType(), sgMap, descOp.getType().getMemorySpace());
-// if (failed(distributedDescTypeOrFailure))
-// return rewriter.notifyMatchFailure(descOp,
-// "Failed to distribute the desc type");
-// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-// auto distributedShape = newTDescType.getShape();
-// // use the base memref strides
-// SmallVector<OpFoldResult> overwriteStrides =
-// getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
-// SmallVector<OpFoldResult> overwriteSizes =
-// getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
-
-// SmallVector<size_t> newRetIndices;
-// vector::WarpExecuteOnLane0Op newWarpOp =
-// moveRegionToNewWarpOpAndAppendReturns(
-// rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
-// newRetIndices);
-
-// rewriter.setInsertionPointAfter(newWarpOp);
-// auto subview = rewriter.create<memref::SubViewOp>(
-// newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
-// overwriteSizes, overwriteStrides);
-// subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
-
-// auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
-// auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
-// newWarpOp.getLoc(), newTDescType, subview,
-// getAsOpFoldResult({zero, zero}));
-
-// Value distributedVal = newWarpOp.getResult(operandIdx);
-// rewriter.replaceAllUsesWith(distributedVal, newDescOp);
-
-// return success();
-// }
+LogicalResult
+WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const {
+ OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+ return isa<xegpu::CreateNdDescOp>(op) && op->hasOneUse();
+ });
+
+ if (!operand)
+ return rewriter.notifyMatchFailure(
+ warpOp, "warp result is not a xegpu::CreateNdDesc op");
+ auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+  assert(descOp && "desc op must not be null");
+ unsigned operandIdx = operand->getOperandNumber();
+
+  // TODO: check whether the memref is uniform in the region.
+ rewriter.setInsertionPoint(warpOp);
+ auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+  assert(srcTypedVal && "source value must not be null");
+
+ auto descOffsets = descOp.getMixedOffsets();
+ if (descOffsets.size() != 2)
+ return rewriter.notifyMatchFailure(descOp,
+ "offsets size is expected to be 2");
+
+ xegpu::SGMapAttr sgMap = descOp.getType().getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ descOp, "the tensor descriptor lacks sg_map attribute");
+
+ auto layout = sgMap.getWiLayout();
+
+ // Calculate the offset within tensor descriptor for the current lane_id. The
+ // access to proper element for a work item is done through a lane-specific
+ // subview (tdesc offsets are used as base, lane shift is added on top).
+ auto laneid = warpOp.getLaneid();
+ auto xDim =
+ rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
+ auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
+ auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
+
+ auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+ descOffsets[0]);
+ auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
+ descOffsets[1]);
+ auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
+ auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
+
+ auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+ descOp.getType(), sgMap, descOp.getType().getMemorySpace());
+ if (failed(distributedDescTypeOrFailure))
+ return rewriter.notifyMatchFailure(descOp,
+ "Failed to distribute the desc type");
+ xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+ auto distributedShape = newTDescType.getShape();
+ // use the base memref strides
+ SmallVector<OpFoldResult> overwriteStrides =
+ getAsIndexOpFoldResult(rewriter.getContext(), SmallVector<int64_t>{1, 1});
+ SmallVector<OpFoldResult> overwriteSizes =
+ getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp =
+ moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+ newRetIndices);
+
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto subview = rewriter.create<memref::SubViewOp>(
+ newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
+ overwriteSizes, overwriteStrides);
+ subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
+
+ auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
+ auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+ newWarpOp.getLoc(), newTDescType, subview,
+ getAsOpFoldResult({zero, zero}));
+
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+
+ return success();
+}
void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
- // patterns.add<WarpOpTensorDescOp>(patterns.getContext());
+ patterns.add<WarpOpTensorDescOp>(patterns.getContext());
// patterns.add<WarpOpStoreNd>(patterns.getContext());
// patterns.add<WarpOpLoadNd>(patterns.getContext());
}
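
A quick note for readers following the series: once re-enabled on gpu::WarpExecuteOnLane0Op, these patterns are meant to be driven like any other greedy rewrite. Below is a minimal sketch of such a driver; only populateXeGPUDistributePatterns comes from this patch, while the wrapper name, the header path for the XeGPU declarations, and the choice of the greedy driver are illustrative assumptions.

// Hypothetical driver, not part of the PR; only the populate call is real.
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" // assumed declaration location
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include <utility>

static void runXeGPUDistribution(mlir::Operation *root) {
  mlir::RewritePatternSet patterns(root->getContext());
  mlir::xegpu::populateXeGPUDistributePatterns(patterns);
  // Apply the distribution patterns until a fixpoint is reached.
  (void)mlir::applyPatternsAndFoldGreedily(root, std::move(patterns));
}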
>From 9888c849327aa0caef34325c0a8333851503714d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 21:15:39 +0000
Subject: [PATCH 4/7] fix
---
mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 3 +--
mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp | 6 +++++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5910aa3f7f2dae..f3ffbd0f5a027d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -327,8 +327,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
let hasVerifier = 1;
}
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
- AllElementTypesMatch<["value", "TensorDesc"]>]> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
let summary = "stores a n-D block register region back to memory, currently only supports 2D";
let description = [{
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 5c45a293a8d468..ac02be78280218 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -170,7 +170,11 @@ getDistributedTensorDescType(xegpu::TensorDescType originalT,
for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
if (!divisible(APInt(64, o), APInt(64, l)))
return failure();
- distributedShape.push_back(o / l);
+    // The tensor descriptor shape is divided only in the scattered case.
+ if (originalT.isScattered())
+ distributedShape.push_back(o / l);
+ else
+ distributedShape.push_back(o);
}
xegpu::TensorDescType distributedDescType;
if (originalT.isScattered()) {
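
To make the new behavior concrete: for a 24x32 descriptor with wi_layout = [1, 16], the scattered case still yields a 24x2 per-lane descriptor, while the blocked case now keeps the full 24x32 shape. The following standalone sketch restates that rule; the helper name and the worked numbers are illustrative and not part of the patch.

// Illustrative restatement of the shape rule above; not the PR's helper.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>

static llvm::SmallVector<int64_t, 2>
distributeShape(llvm::ArrayRef<int64_t> shape, llvm::ArrayRef<int64_t> wiLayout,
                bool scattered) {
  llvm::SmallVector<int64_t, 2> result;
  for (auto [dim, lanes] : llvm::zip_equal(shape, wiLayout))
    result.push_back(scattered ? dim / lanes : dim); // divisibility checked by the caller
  return result;
}
// distributeShape({24, 32}, {1, 16}, /*scattered=*/true)  returns {24, 2}.
// distributeShape({24, 32}, {1, 16}, /*scattered=*/false) returns {24, 32}.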
>From 491625d81d80cb0092ec5704f0de23c039e9c681 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 21:34:10 +0000
Subject: [PATCH 5/7] fix
---
.../XeGPU/Transforms/XeGPUDistribute.cpp | 319 +++++++++---------
1 file changed, 161 insertions(+), 158 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index ac02be78280218..95610a9af7ac85 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -33,11 +33,15 @@ bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by the
-// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// The warp op will still contain the original op that will not be used by
+// the
+// /// yield op (and should be cleaned up later with dce). The yield op will
+// bypass
// /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item and
-// /// appropriate offset. The distributed create_nd_tdesc points into the subview
+// /// The rewrite will create a subview of the size used by a single work item
+// and
+// /// appropriate offset. The distributed create_nd_tdesc points into the
+// subview
// /// without offset. The tensor descriptor types is distributed according to
// /// sg_map attribute.
// ///
@@ -75,8 +79,10 @@ struct WarpOpTensorDescOp final
};
// /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are passed
-// /// through the warp op interface they would be propagated as returned values.
+// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+// passed
+// /// through the warp op interface they would be propagated as returned
+// values.
// /// Both the stored vector type and tensor descriptor types are distributed
// /// according to sg_map attribute.
// ///
@@ -97,20 +103,23 @@ struct WarpOpTensorDescOp final
// /// ...
// /// vector.yield
// /// }
-// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
+// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+// !xegpu.tensor_desc<4x1xf32>
// ///
// /// ```
-// struct WarpOpStoreNd final
-// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const override;
-// };
+struct WarpOpStoreNd final
+ : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+ using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override;
+};
// /// Clone a load_nd feeding into vector.yield op for the enclosing
// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by the
-// /// yield op (and should be cleaned up later with dce). The yield op will bypass
+// /// The warp op will still contain the original op that will not be used by
+// the
+// /// yield op (and should be cleaned up later with dce). The yield op will
+// bypass
// /// the load's arguments.
// /// Both the loaded vector type and tensor descriptor types are distributed
// /// according to sg_map attribute.
@@ -137,28 +146,27 @@ struct WarpOpTensorDescOp final
// /// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
// ///
// /// ```
-// struct WarpOpLoadNd final
-// : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
-// using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
-// LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const override;
-// };
-
-// FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
-// xegpu::SGMapAttr sgMap) {
-// llvm::SmallVector<int64_t, 2> distributedShape;
-// auto layout = sgMap.getWiLayout();
-// auto shape = originalT.getShape();
-// for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
-// if (!divisible(APInt(64, o), APInt(64, l)))
-// return failure();
-// distributedShape.push_back(o / l);
-// }
-// auto newVectorType =
-// VectorType::get(distributedShape, originalT.getElementType(),
-// originalT.getScalableDims());
-// return newVectorType;
-// }
+struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
+ using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override;
+};
+
+FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
+ xegpu::SGMapAttr sgMap) {
+ llvm::SmallVector<int64_t, 2> distributedShape;
+ auto layout = sgMap.getWiLayout();
+ auto shape = originalT.getShape();
+ for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+ if (!divisible(APInt(64, o), APInt(64, l)))
+ return failure();
+ distributedShape.push_back(o / l);
+ }
+ auto newVectorType =
+ VectorType::get(distributedShape, originalT.getElementType(),
+ originalT.getScalableDims());
+ return newVectorType;
+}
FailureOr<xegpu::TensorDescType>
getDistributedTensorDescType(xegpu::TensorDescType originalT,
@@ -192,121 +200,117 @@ getDistributedTensorDescType(xegpu::TensorDescType originalT,
}
} // namespace
-// LogicalResult
-// WarpOpStoreNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const {
-// auto yield = cast<vector::YieldOp>(
-// warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-// Operation *lastNode = yield->getPrevNode();
-// auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
-// if (!storeOp)
-// return failure();
-
-// auto origType = storeOp.getTensorDescType();
-// xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
-// if (!sgMap)
-// return rewriter.notifyMatchFailure(
-// storeOp, "the source tensor descriptor lacks sg_map attribute");
-
-// if (storeOp.getTensorDescType().getShape().size() != 2)
-// return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
-// DBGS() << "Matched store_nd: " << storeOp << "\n";
-
-// auto distributedTypeOrFailure =
-// getDistributedVectorType(storeOp.getValueType(), sgMap);
-// if (failed(distributedTypeOrFailure))
-// return rewriter.notifyMatchFailure(storeOp,
-// "Failed to distribute the type");
-// VectorType newVectorType = distributedTypeOrFailure.value();
-
-// auto distributedDescTypeOrFailure = getDistributedTensorDescType(
-// storeOp.getTensorDescType(), sgMap,
-// storeOp.getTensorDescType().getMemorySpace());
-// if (failed(distributedDescTypeOrFailure))
-// return rewriter.notifyMatchFailure(storeOp,
-// "Failed to distribute the desc type");
-// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-
-// SmallVector<size_t> newRetIndices;
-// vector::WarpExecuteOnLane0Op newWarpOp =
-// moveRegionToNewWarpOpAndAppendReturns(
-// rewriter, warpOp,
-// ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
-// TypeRange{newTDescType, newVectorType}, newRetIndices);
-
-// rewriter.setInsertionPointAfter(newWarpOp);
-// auto newStoreOp =
-// cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
-// rewriter.eraseOp(storeOp);
-// newStoreOp.getTensorDescMutable().assign(
-// newWarpOp.getResult(newRetIndices[0]));
-// newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
-
-// return success();
-// }
-
-// LogicalResult WarpOpLoadNd::matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
-// PatternRewriter &rewriter) const {
-// OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
-// return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
-// });
-
-// if (!operand)
-// return rewriter.notifyMatchFailure(warpOp,
-// "warp result is not a xegpu::LoadNd op");
-
-// auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
-
-// if (loadOp.getPacked())
-// return rewriter.notifyMatchFailure(
-// loadOp, "Packed load distribution not supported");
-
-// xegpu::TensorDescType origType = loadOp.getTensorDescType();
-// xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
-// if (!sgMap)
-// return rewriter.notifyMatchFailure(
-// loadOp, "the source tensor descriptor lacks sg_map attribute");
-
-// auto origShape = origType.getShape();
-// if (origShape.size() != 2)
-// return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
-
-// auto distributedTypeOrFailure =
-// getDistributedVectorType(loadOp.getType(), sgMap);
-// if (failed(distributedTypeOrFailure))
-// return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
-// VectorType newVectorType = distributedTypeOrFailure.value();
-
-// auto distributedDescTypeOrFailure =
-// getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
-// loadOp.getTensorDescType().getMemorySpace());
-// if (failed(distributedDescTypeOrFailure))
-// return rewriter.notifyMatchFailure(loadOp,
-// "Failed to distribute the desc type");
-// xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
-
-// unsigned operandIdx = operand->getOperandNumber();
-
-// SmallVector<size_t> newRetIndices;
-// vector::WarpExecuteOnLane0Op newWarpOp =
-// moveRegionToNewWarpOpAndAppendReturns(
-// rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
-// newRetIndices);
-
-// rewriter.setInsertionPointAfter(newWarpOp);
-
-// auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
-// loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
-// loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
-// loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
-
-// newLoadOp.getTensorDescMutable().assign(
-// newWarpOp.getResult(newRetIndices[0]));
-// Value distributedVal = newWarpOp.getResult(operandIdx);
-// rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
-
-// return success();
-// }
+LogicalResult WarpOpStoreNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const {
+ auto yield = cast<gpu::YieldOp>(
+ warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+ if (!storeOp)
+ return failure();
+
+ auto origType = storeOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+ if (storeOp.getTensorDescType().getShape().size() != 2)
+ return rewriter.notifyMatchFailure(storeOp, "unsupported shape");
+ DBGS() << "Matched store_nd: " << storeOp << "\n";
+
+ auto distributedTypeOrFailure =
+ getDistributedVectorType(storeOp.getValueType(), sgMap);
+ if (failed(distributedTypeOrFailure))
+ return rewriter.notifyMatchFailure(storeOp,
+ "Failed to distribute the type");
+ VectorType newVectorType = distributedTypeOrFailure.value();
+
+ auto distributedDescTypeOrFailure = getDistributedTensorDescType(
+ storeOp.getTensorDescType(), sgMap,
+ storeOp.getTensorDescType().getMemorySpace());
+ if (failed(distributedDescTypeOrFailure))
+ return rewriter.notifyMatchFailure(storeOp,
+ "Failed to distribute the desc type");
+ xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, ValueRange{storeOp.getTensorDesc(), storeOp.getValue()},
+ TypeRange{newTDescType, newVectorType}, newRetIndices);
+
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newStoreOp =
+ cast<xegpu::StoreNdOp>(rewriter.clone(*storeOp.getOperation()));
+ rewriter.eraseOp(storeOp);
+ newStoreOp.getTensorDescMutable().assign(
+ newWarpOp.getResult(newRetIndices[0]));
+ newStoreOp.getValueMutable().assign(newWarpOp.getResult(newRetIndices[1]));
+
+ return success();
+}
+
+LogicalResult WarpOpLoadNd::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const {
+ OpOperand *operand = getWarpResult(warpOp, [](Operation *op) {
+ return isa<xegpu::LoadNdOp>(op) && op->hasOneUse();
+ });
+
+ if (!operand)
+ return rewriter.notifyMatchFailure(warpOp,
+ "warp result is not a xegpu::LoadNd op");
+
+ auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+
+ if (loadOp.getPacked())
+ return rewriter.notifyMatchFailure(
+ loadOp, "Packed load distribution not supported");
+
+ xegpu::TensorDescType origType = loadOp.getTensorDescType();
+ xegpu::SGMapAttr sgMap = origType.getSGMapAttr();
+ if (!sgMap)
+ return rewriter.notifyMatchFailure(
+ loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+ auto origShape = origType.getShape();
+ if (origShape.size() != 2)
+ return rewriter.notifyMatchFailure(loadOp, "unsupported shape");
+
+ auto distributedTypeOrFailure =
+ getDistributedVectorType(loadOp.getType(), sgMap);
+ if (failed(distributedTypeOrFailure))
+ return rewriter.notifyMatchFailure(loadOp, "Failed to distribute the type");
+ VectorType newVectorType = distributedTypeOrFailure.value();
+
+ auto distributedDescTypeOrFailure =
+ getDistributedTensorDescType(loadOp.getTensorDescType(), sgMap,
+ loadOp.getTensorDescType().getMemorySpace());
+ if (failed(distributedDescTypeOrFailure))
+ return rewriter.notifyMatchFailure(loadOp,
+ "Failed to distribute the desc type");
+ xegpu::TensorDescType newTDescType = distributedDescTypeOrFailure.value();
+
+ unsigned operandIdx = operand->getOperandNumber();
+
+ SmallVector<size_t> newRetIndices;
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, loadOp.getTensorDesc(), TypeRange{newTDescType},
+ newRetIndices);
+
+ rewriter.setInsertionPointAfter(newWarpOp);
+
+ auto newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+ loadOp.getLoc(), newVectorType, loadOp.getTensorDesc(),
+ loadOp.getPackedAttr(), loadOp.getTransposeAttr(), loadOp.getL1HintAttr(),
+ loadOp.getL2HintAttr(), loadOp.getL3HintAttr());
+
+ newLoadOp.getTensorDescMutable().assign(
+ newWarpOp.getResult(newRetIndices[0]));
+ Value distributedVal = newWarpOp.getResult(operandIdx);
+ rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+
+ return success();
+}
LogicalResult
WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -369,10 +373,9 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
getAsIndexOpFoldResult(rewriter.getContext(), distributedShape);
SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp =
- moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
- newRetIndices);
+ gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+ rewriter, warpOp, descOp.getSource(), descOp.getSourceType(),
+ newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
auto subview = rewriter.create<memref::SubViewOp>(
@@ -393,6 +396,6 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
void xegpu::populateXeGPUDistributePatterns(RewritePatternSet &patterns) {
patterns.add<WarpOpTensorDescOp>(patterns.getContext());
- // patterns.add<WarpOpStoreNd>(patterns.getContext());
- // patterns.add<WarpOpLoadNd>(patterns.getContext());
+ patterns.add<WarpOpStoreNd>(patterns.getContext());
+ patterns.add<WarpOpLoadNd>(patterns.getContext());
}
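
For the load/store patterns enabled here, the value type still follows the division rule through getDistributedVectorType: with wi_layout = [1, 16], a vector<24x32xf16> becomes vector<24x2xf16>, which is the shape the updated tests check for. Below is a minimal sketch of building that distributed type with the core IR API; the free function and the context handling are assumptions, only the shapes come from the patch and its tests.

// Sketch only: the per-lane value type for a 24x32 f16 tile distributed
// over wi_layout = [1, 16].
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

static mlir::VectorType makeDistributedValueType(mlir::MLIRContext *ctx) {
  // 24 / 1 == 24 rows per lane, 32 / 16 == 2 columns per lane.
  return mlir::VectorType::get({24, 2}, mlir::Float16Type::get(ctx));
}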
>From 07f9f9f77953c4278912afd12910f2264f3e36e3 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 22:37:28 +0000
Subject: [PATCH 6/7] fix
---
.../XeGPU/Transforms/XeGPUDistribute.cpp | 29 ++----------------
mlir/test/Dialect/XeGPU/xegpu-distribute.mlir | 30 ++++++++-----------
2 files changed, 16 insertions(+), 43 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 95610a9af7ac85..297cff8d0144d5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -341,24 +341,6 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks sg_map attribute");
- auto layout = sgMap.getWiLayout();
-
- // Calculate the offset within tensor descriptor for the current lane_id. The
- // access to proper element for a work item is done through a lane-specific
- // subview (tdesc offsets are used as base, lane shift is added on top).
- auto laneid = warpOp.getLaneid();
- auto xDim =
- rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), layout[0]);
- auto shiftx = rewriter.create<arith::RemUIOp>(laneid.getLoc(), laneid, xDim);
- auto shifty = rewriter.create<arith::DivUIOp>(laneid.getLoc(), laneid, xDim);
-
- auto basex = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
- descOffsets[0]);
- auto basey = getValueOrCreateConstantIndexOp(rewriter, laneid.getLoc(),
- descOffsets[1]);
- auto offsetx = rewriter.create<arith::AddIOp>(laneid.getLoc(), shiftx, basex);
- auto offsety = rewriter.create<arith::AddIOp>(laneid.getLoc(), shifty, basey);
-
auto distributedDescTypeOrFailure = getDistributedTensorDescType(
descOp.getType(), sgMap, descOp.getType().getMemorySpace());
if (failed(distributedDescTypeOrFailure))
@@ -378,15 +360,10 @@ WarpOpTensorDescOp::matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
newRetIndices);
rewriter.setInsertionPointAfter(newWarpOp);
- auto subview = rewriter.create<memref::SubViewOp>(
- newWarpOp.getLoc(), srcTypedVal, getAsOpFoldResult({offsetx, offsety}),
- overwriteSizes, overwriteStrides);
- subview.getSourceMutable().assign(newWarpOp.getResult(newRetIndices[0]));
-
- auto zero = rewriter.create<arith::ConstantIndexOp>(laneid.getLoc(), 0);
auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
- newWarpOp.getLoc(), newTDescType, subview,
- getAsOpFoldResult({zero, zero}));
+ newWarpOp.getLoc(), newTDescType,
+ dyn_cast<TypedValue<MemRefType>>(newWarpOp.getResult(newRetIndices[0])),
+ descOffsets);
Value distributedVal = newWarpOp.getResult(operandIdx);
rewriter.replaceAllUsesWith(distributedVal, newDescOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
index f9efda80ab6468..755f4922bfaa23 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-distribute.mlir
@@ -8,9 +8,9 @@
// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}}, %{{.*}} : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x2xf16>)
// CHECK: ^bb0(%[[src:.*]]: vector<24x32xf16>, %[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
-// CHECK: gpu.yield%[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
+// CHECK: gpu.yield %[[dst]], %[[src]] : !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, vector<24x32xf16>
// CHECK: xegpu.store_nd %[[res]]#1, %[[res]]#0 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> :
-// CHECK-SAME: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) -> () {
%laneid = gpu.lane_id
@@ -23,7 +23,6 @@ func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tens
}
// -----
-
#sg_map_16 = #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>
#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
@@ -33,7 +32,7 @@ func.func @test_store_nd_distribution(%src: vector<24x32xf16>, %dst: !xegpu.tens
// CHECK-SAME: -> (vector<24x2xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
// CHECK: ^bb0(%[[dst:.*]]: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>)
// CHECK: %[[dead:.*]] = xegpu.load_nd
-// CHECK: gpu.yield%[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
+// CHECK: gpu.yield %[[dead]], %[[dst]] : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
// CHECK: %[[load:.*]] = xegpu.load_nd %[[res]]#1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> :
// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>> -> vector<24x2xf16>
// CHECK: return %[[load]]
@@ -56,26 +55,23 @@ func.func @test_load_nd_distribution(%dst: !xegpu.tensor_desc<24x32xf16, #blk_td
#blk_tdesc = #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>
// CHECK-LABEL: test_create_nd_desc_distribution
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C12:.*]] = arith.constant 12 : index
// CHECK: %[[laneid:.*]] = gpu.lane_id
// CHECK: %[[res:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[16] args(%{{.*}} : memref<24x32xf16>)
-// CHECK-SAME: -> (!xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
+// CHECK-SAME: -> (!xegpu.tensor_desc<12x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>)
// CHECK: ^bb0(%[[dst:.*]]: memref<24x32xf16>)
// CHECK: %[[dead:.*]] = xegpu.create_nd_tdesc
-// CHECK: gpu.yield%[[dead]], %[[dst]] :
-// CHECK-SAME: !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>, memref<24x32xf16>
-// CHECK: %[[view:.*]] = memref.subview %[[res]]#1[%[[C0]], %[[laneid]]] [24, 2] [1, 1] : memref<24x32xf16> to memref<24x2xf16, strided<[32, 1], offset: ?>>
-// CHECK: %[[desc:.*]] = xegpu.create_nd_tdesc %[[view]][0, 0] : memref<24x2xf16, strided<[32, 1], offset: ?>>
-// CHECK-SAME: -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 1 : i64, boundary_check = true>, #xegpu.sg_map<wi_layout = [1, 16], wi_data = [1, 1]>>
-// CHECK: return %[[desc]]
+// CHECK: gpu.yield %[[dead]], %[[dst]] :
+
-func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+func.func @test_create_nd_desc_distribution(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>) {
%laneid = gpu.lane_id
+ %c12 = arith.constant 12 : index
%r = gpu.warp_execute_on_lane_0(%laneid)[16]
- args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>) {
+ args(%dst: memref<24x32xf16>) -> (!xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>) {
^bb0(%arg0: memref<24x32xf16>):
- %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
- gpu.yield%0 : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+ %0 = xegpu.create_nd_tdesc %arg0[%c12, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
+    gpu.yield %0 : !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
}
- return %r : !xegpu.tensor_desc<24x32xf16, #blk_tdesc, #sg_map_16>
+ return %r : !xegpu.tensor_desc<12x32xf16, #blk_tdesc, #sg_map_16>
}
\ No newline at end of file
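
A note on the updated test: the distributed create_nd_tdesc now receives the original mixed offsets ([%c12, 0]) unchanged, since the rewrite forwards descOp.getMixedOffsets() instead of building a per-lane subview. The sketch below shows how such a mixed (Value-or-Attribute) offset list is typically assembled; the helper name and builder plumbing are assumptions, the offset values come from the test above.

// Hypothetical helper mirroring the [%c12, 0] offsets used in the test above.
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/OpDefinition.h"
#include "llvm/ADT/SmallVector.h"

static llvm::SmallVector<mlir::OpFoldResult>
makeTestOffsets(mlir::OpBuilder &b, mlir::Location loc) {
  // Dynamic index 12 for the row offset, static 0 for the column offset.
  mlir::Value c12 = b.create<mlir::arith::ConstantIndexOp>(loc, 12);
  return {mlir::OpFoldResult(c12), mlir::OpFoldResult(b.getIndexAttr(0))};
}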
>From b842f331ffab253aaf8924b5a6574d35c4fdcf9d Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Thu, 12 Dec 2024 22:45:02 +0000
Subject: [PATCH 7/7] fix
---
.../XeGPU/Transforms/XeGPUDistribute.cpp | 200 +++++++++---------
1 file changed, 99 insertions(+), 101 deletions(-)
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
index 297cff8d0144d5..e3f48919c39266 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUDistribute.cpp
@@ -31,46 +31,46 @@ using namespace mlir;
namespace {
bool divisible(APInt lhs, APInt rhs) { return !lhs.urem(rhs); }
-// /// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the create_nd_tdesc's arguments.
-// /// The rewrite will create a subview of the size used by a single work item
-// and
-// /// appropriate offset. The distributed create_nd_tdesc points into the
-// subview
-// /// without offset. The tensor descriptor types is distributed according to
-// /// sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %td = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %td
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
-// /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield %arg0, %dead
-// /// }
-// /// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
-// /// : memref<4x8xf32> to memref<4x1xf32>
-// /// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
-// /// -> !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a create_nd_tdesc feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the create_nd_tdesc's arguments.
+/// The rewrite will create a subview of the size used by a single work item
+/// and an appropriate offset. The distributed create_nd_tdesc points into the
+/// subview without offset. The tensor descriptor type is distributed
+/// according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// vector.yield %arg0, %dead
+/// }
+/// %view = memref.subview %r#0[0, %laneid] [4, 1] [1, 1]
+/// : memref<4x8xf32> to memref<4x1xf32>
+/// %td = xegpu.create_nd_tdesc %view[0, 0]: memref<4x1xf32>
+/// -> !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
struct WarpOpTensorDescOp final
: public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
@@ -78,35 +78,35 @@ struct WarpOpTensorDescOp final
PatternRewriter &rewriter) const override;
};
-// /// Sink a store_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0`. In case arguments for the store are
-// passed
-// /// through the warp op interface they would be propagated as returned
-// values.
-// /// Both the stored vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
-// /// !xegpu.tensor_desc<4x8xf32>
-// /// vector.yield
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// vector.yield
-// /// }
-// /// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
-// !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Sink a store_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0`. In case arguments for the store are
+/// passed through the warp op interface, they would be propagated as
+/// returned values.
+/// Both the stored vector type and tensor descriptor types are distributed
+/// according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+/// vector.yield
+/// }
+/// ```
+/// To
+/// ```
+/// %r = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// vector.yield
+/// }
+/// xegpu.store_nd %arg0, %arg1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
struct WarpOpStoreNd final
: public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
@@ -114,38 +114,36 @@ struct WarpOpStoreNd final
PatternRewriter &rewriter) const override;
};
-// /// Clone a load_nd feeding into vector.yield op for the enclosing
-// /// `vector.warp_execute_on_lane_0` and put it after the warp op.
-// /// The warp op will still contain the original op that will not be used by
-// the
-// /// yield op (and should be cleaned up later with dce). The yield op will
-// bypass
-// /// the load's arguments.
-// /// Both the loaded vector type and tensor descriptor types are distributed
-// /// according to sg_map attribute.
-// ///
-// /// Example:
-// ///
-// /// ```
-// /// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
-// /// %r = vector.warp_execute_on_lane_0(%laneid) ->
-// /// (!xegpu.tensor_desc<4x8xf32>) {
-// /// ...
-// /// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
-// /// vector<4x8xf32> vector.yield %ld
-// /// }
-// /// ```
-// /// To
-// /// ```
-// /// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
-// /// ...
-// /// %dead = xegpu.load_nd %arg0, %arg1:
-// /// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
-// /// vector.yield %arg0, %arg1
-// /// }
-// /// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>, !xegpu.tensor_desc<4x1xf32>
-// ///
-// /// ```
+/// Clone a load_nd feeding into vector.yield op for the enclosing
+/// `vector.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Both the loaded vector type and tensor
+/// descriptor types are distributed according to sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = vector.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32>,
+///    vector<4x8xf32>
+///    vector.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = vector.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.load_nd %arg0, %arg1:
+/// !xegpu.tensor_desc<4x8xf32>, vector<4x8xf32>
+/// vector.yield %arg0, %arg1
+/// }
+/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x1xf32>
+///
+/// ```
struct WarpOpLoadNd final : public OpRewritePattern<gpu::WarpExecuteOnLane0Op> {
using OpRewritePattern<gpu::WarpExecuteOnLane0Op>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,