[Mlir-commits] [mlir] [MLIR][XeGPU] Add unroll patterns and blocking pass for XeGPU [2/N] (PR #140163)
Chao Chen
llvmlistbot at llvm.org
Wed May 28 11:44:31 PDT 2025
================
@@ -0,0 +1,331 @@
+//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUBLOCKING
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-blocking"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+using namespace mlir;
+
+namespace {
+
+// Resolve the unrealized conversion cast ops generated when doing SCF
+// structural type conversion. The cast op takes one of two forms: an N:1
+// vector cast or a 1:N vector cast. vector::insert_strided_slice ops are
+// used to handle the first case, and vector::extract_strided_slice ops are
+// used to handle the second case.
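+// For example (shapes below are illustrative only):
+//   N:1 cast: %r = builtin.unrealized_conversion_cast %a, %b
+//               : vector<8x16xf32>, vector<8x16xf32> to vector<8x32xf32>
+//   1:N cast: %r:2 = builtin.unrealized_conversion_cast %a
+//               : vector<8x32xf32> to vector<8x16xf32>, vector<8x16xf32>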
+static void
+resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
+ ValueRange inputs = castOp.getInputs();
+ ValueRange outputs = castOp.getOutputs();
+ if (inputs.empty() || outputs.empty()) {
+ LDBG("erase unrealized conversion cast op has no inputs/outputs.");
+ castOp->erase();
+ return;
+ }
+
+ VectorType inputTy = dyn_cast<VectorType>(inputs[0].getType());
+ VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
+ if (!inputTy || !outputTy) {
+ LDBG("skip unrealized conversion cast op has non-vector inputs/outputs.");
+ return;
+ }
+
+ // We are only interested in the case where all inputs and outputs have
+ // identical types.
+ if (llvm::any_of(castOp->getOperandTypes(),
+ [&](Type t) { return t != inputTy; }) ||
+ llvm::any_of(castOp->getResultTypes(),
+ [&](Type t) { return t != outputTy; })) {
+ LDBG("skip unrealized conversion cast op not emulating pack/unpack.");
+ return;
+ }
+
+ OpBuilder builder(castOp);
+ if (inputs.size() > 1 && outputs.size() == 1) {
+ // the castOp is emulating an unpack op
+ ArrayRef<int64_t> shape = outputTy.getShape();
+ Value result = xegpu::createVectorWithShapeFromValues(
+ builder, castOp.getLoc(), inputs, shape);
+ castOp->replaceAllUsesWith(ValueRange(result));
+ castOp->erase();
+ } else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
+ // the castOp is emulating a pack op
+ ArrayRef<int64_t> tileShape = outputTy.getShape();
+ SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
+ builder, castOp.getLoc(), inputs[0], tileShape);
+ castOp->replaceAllUsesWith(results);
+ castOp->erase();
+ }
+}
+
+/// Unroll XeGPU ops to their instruction-level representation.
+class XeGPUBlockingPass final
+ : public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
+public:
+ void runOnOperation() override;
+
+private:
+ // Get the tile shape for a given OpOperand or OpResult by examining the
+ // corresponding layout attribute. If the layout is not present or is not a
+ // subgroup-level layout, it returns std::nullopt.
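+ // For example (illustrative only, assuming the tile shape is taken from the
+ // layout's inst_data): an operand annotated with
+ // layout_operand_0 = #xegpu.layout<inst_data = [8, 16]> would yield the
+ // tile shape [8, 16].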
+ template <typename T,
+ typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
+ std::is_same_v<T, OpResult>>>
+ std::optional<SmallVector<int64_t>>
+ getTileShape(const T &operandOrResult) const;
+
+ // Get the tile shape for a given operation.
+ std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;
+
+ // Determine if the operation requires unrolling. Return false if all operands
+ // and results have tile shapes identical to their original types. Otherwise,
+ // return true.
+ bool needsUnroll(Operation *op) const;
+};
+} // namespace
+
+template <typename T, typename>
+std::optional<SmallVector<int64_t>>
+XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
----------------
chencha3 wrote:
I need the OpOperand and OpResult here: their indices are used to retrieve the layout encoded in the operation (as `layout_operand_i` and `layout_result_i`, respectively).
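A rough sketch of that index-based lookup (names like `lookupOperandLayout` are hypothetical, not the helpers used in this PR):

```c++
// Sketch: look up the layout attribute recorded on the owning op for a given
// OpOperand, composing the attribute name from the operand's index.
static xegpu::LayoutAttr lookupOperandLayout(OpOperand &operand) {
  Operation *op = operand.getOwner();
  std::string attrName =
      "layout_operand_" + std::to_string(operand.getOperandNumber());
  return op->getAttrOfType<xegpu::LayoutAttr>(attrName);
}
```

The result-side lookup would be symmetric, using `layout_result_i` with the result number.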
https://github.com/llvm/llvm-project/pull/140163