[Mlir-commits] [mlir] [mlir][xegpu] XeGPU distribution patterns for load_nd, store_nd, and create_nd_tdesc. (PR #119783)
Adam Siemieniuk
llvmlistbot at llvm.org
Thu Feb 6 09:34:16 PST 2025
================
@@ -0,0 +1,353 @@
+//=- XeGPUSubgroupDistribute.cpp - distribute XeGPU ops to work items *-C++-*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/IR/Value.h"
+
+#define DEBUG_TYPE "xegpu-distribute"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+/// Returns true when `lhs` is an exact unsigned multiple of `rhs`, i.e. the
+/// unsigned remainder of dividing `lhs` by `rhs` is zero.
+bool divisible(APInt lhs, APInt rhs) {
+  const APInt remainder = lhs.urem(rhs);
+  return remainder.isZero();
+}
+
+/// Clone a create_nd_tdesc feeding into the gpu.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op will
+/// still contain the original op that will not be used by the yield op (and
+/// should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. Tensor descriptor is not distributed because it
+/// is a uniform value across all work items within the subgroup.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (!xegpu.tensor_desc<4x8xf32>) {
+/// ...
+/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// gpu.yield %td
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+/// gpu.yield %arg0, %dead
+/// }
+/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpTensorDescOp final : public gpu::WarpDistributionPattern {
+ // Reuse the constructors of the base warp distribution pattern.
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ // Declared here only; the rewrite itself is defined out of line.
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
+/// Sink a store_nd op to the end of the enclosing `gpu.warp_execute_on_lane_0`.
+/// In case arguments for the store are passed through the warp op interface,
+/// they are propagated as returned values. Only the source vector for the store
+/// is distributed according to the sg_map attribute.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32>
+/// }
+/// xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+/// !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct SubgroupOpStoreNd final : public gpu::WarpDistributionPattern {
+ // Reuse the constructors of the base warp distribution pattern.
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ // Declared here only; the rewrite itself is defined out of line.
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
+/// Clone a load_nd feeding into the gpu.yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed according
+/// to the sg_map attribute; the tensor descriptor type is not distributed.
+///
+/// Example:
+///
+/// ```
+/// #sg_map_8 = #xegpu.sg_map<wi_layout = [1, 8], wi_data = [1, 1]>
+/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
+/// (vector<4x1xf32>) {
+/// ...
+/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32> ->
+/// vector<4x8xf32>
+/// gpu.yield %ld
+/// }
+/// ```
+/// To
+/// ```
+/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> () {
+/// ...
+/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
+/// vector<4x8xf32>
+/// gpu.yield %arg0, %arg1
+/// }
+/// %ld = xegpu.load_nd %r#0: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
+///
+/// ```
+struct SubgroupOpLoadNd final : public gpu::WarpDistributionPattern {
+ // Reuse the constructors of the base warp distribution pattern.
+ using gpu::WarpDistributionPattern::WarpDistributionPattern;
+ // Declared here only; the rewrite itself is defined out of line.
+ LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+ PatternRewriter &rewriter) const override;
+};
+
+/// Returns the distributed vector type for a source vector type according to
+/// the sg_map attribute: every dimension of `originalT` is divided by the
+/// matching entry of the sg_map's `wi_layout`. The element type and scalable
+/// dims of `originalT` are carried over unchanged.
+///
+/// Returns failure if the rank of `wi_layout` differs from the rank of
+/// `originalT`, if a layout entry is zero, or if a dimension is not evenly
+/// divisible by its layout entry.
+FailureOr<VectorType> getDistributedVectorType(VectorType originalT,
+                                               xegpu::SGMapAttr sgMap) {
+  auto layout = sgMap.getWiLayout();
+  auto shape = originalT.getShape();
+  // `llvm::zip_equal` asserts on ranges of different length, so reject a rank
+  // mismatch up front and hand the caller a recoverable failure instead.
+  if (layout.size() != shape.size())
+    return failure();
+  llvm::SmallVector<int64_t, 2> distributedShape;
+  distributedShape.reserve(shape.size());
+  for (const auto [l, o] : llvm::zip_equal(layout, shape)) {
+    // Guard the zero case explicitly: neither `APInt::urem` nor `o / l` is
+    // valid for a zero divisor.
+    if (l == 0 || !divisible(APInt(64, o), APInt(64, l)))
+      return failure();
+    distributedShape.push_back(o / l);
+  }
+  return VectorType::get(distributedShape, originalT.getElementType(),
+                         originalT.getScalableDims());
+}
+
+// Returns the distributed tensor descriptor type for a source tensor descriptor
+// type according to the sg_map attribute. Note that tensor descriptor type is
+// distributed only for the scattered case. For XeGPU ND operations
+// (create_nd_tdesc, load_nd, store_nd), tensor descriptor is considered uniform
+// across all work items within the subgroup and therefore is not distributed.
+FailureOr<xegpu::TensorDescType>
+getDistributedTensorDescType(xegpu::TensorDescType originalT,
+ xegpu::SGMapAttr sgMap,
+ xegpu::MemorySpace memSpace) {
----------------
adam-smnk wrote:
I take it that `memSpace` is there to propagate it to the newly created type. If so, it's unused at the moment.
Anyway, does it need to be a separate argument at all, or could it be taken directly from `originalT`?
https://github.com/llvm/llvm-project/pull/119783
More information about the Mlir-commits
mailing list