[Mlir-commits] [mlir] [mlir][AMDGPU] Add canonicalization pattern to pack scales for ScaledMFMAOp (PR #155951)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Sep 17 22:35:51 PDT 2025
================
@@ -631,6 +634,144 @@ LogicalResult TransposeLoadOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// ScaledMFMAOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Check if the scales input is used in other scaled mfma's while they exist.
+/// If they're unused then pack the scales.
+struct PackScales final : OpRewritePattern<ScaledMFMAOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(ScaledMFMAOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ // If this use of a scale has a non zero opsel, packing has already been
+ // done.
+ auto checkIfUnpackable = [&](OpOperand &op) {
+ if (auto smfma = dyn_cast<ScaledMFMAOp>(op.getOwner())) {
+ switch (op.getOperandNumber()) {
+ case 3:
+ return smfma.getScalesIdxA() != 0;
+ case 4:
+ return smfma.getScalesIdxB() != 0;
+ default:
+ break;
+ }
+ }
+ return true;
+ };
+
+ auto setOpsel = [&](unsigned idx, int64_t val) {
+ switch (idx) {
+ case 3:
+ op.setScalesIdxA(val);
+ break;
+ case 4:
+ op.setScalesIdxB(val);
+ break;
+ default:
+ break;
+ }
+ };
+
+ // Obtain flat index from offsets and shape.
+ auto getIdxFromExtract = [](vector::ExtractOp op) {
+ ShapedType ty = dyn_cast<ShapedType>(op.getOperand(0).getType());
+ int64_t cumul = 1;
+ int64_t idx = 0;
+ for (auto [offset, size] :
+ reverse(llvm::zip_equal(op.getStaticPosition(), ty.getShape()))) {
+ idx += offset * cumul;
+ cumul *= size;
+ }
+ return idx;
+ };
+
+ // For every scale operand of this ScaledMFMAOp, if the scale follows the
+ // following pattern:
+ // (f8 here means f8E8M0FNU)
+ // %unit = vector.extract %ScaleSrc[offsets] : f8 from vector<...>
+ // %scale = vector.insert %unit, ... : f8 into vector<4xf8>
+ // amdgpu.scaled_mfma(%scale[0] * ...
+ //
+ // rewrite to:
+ //
+ // %reshaped = vector.shape_cast %ScaleSrc : vector<...> to vector<?xf8>
+ // %scale = vector.extract %reshaped[?] : vector<4xf8> from vector<?xf8>
+ // amdgpu.scaled_mfma(%scale[0-3] * ...
+ //
+ // This creates duplicate shape_casts for every use but these will be
+ // removed in CSE.
+ for (auto opIdx : std::array<int64_t, 2>({3, 4})) {
+ auto insertOp = op.getOperand(opIdx).getDefiningOp<vector::InsertOp>();
+ if (!insertOp) {
+ return rewriter.notifyMatchFailure(op,
+ "defining op not a vector.insert");
+ }
+ if (llvm::any_of(insertOp.getResult().getUses(), checkIfUnpackable)) {
----------------
Muzammiluddin-Syed-ECE wrote:
This was a poorly thought-out check for packing. I think a better check is to see whether a single scalar is being extracted from a vector of scales. This also removes the need for the `checkIfUnpackable` lambda.
https://github.com/llvm/llvm-project/pull/155951
More information about the Mlir-commits
mailing list