[Mlir-commits] [mlir] [mlir][linalg] Preserve cast semantics during generic to matmul (PR #174757)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Jan 8 01:06:22 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-linalg
Author: Prathamesh Tagore (meshtag)
<details>
<summary>Changes</summary>
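Infer the signed/unsigned cast intent from the cast ops in a linalg.generic body
and propagate it to the specialized op via the matmul `cast` attribute. Without
this, the specialized op always falls back to the default signed cast, which can
lead to silent overflow/underflow errors in end-to-end execution.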
Fixes a functional bug in https://github.com/llvm/llvm-project/issues/174517.
---
Full diff: https://github.com/llvm/llvm-project/pull/174757.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp (+71-4)
- (modified) mlir/test/Dialect/Linalg/specialize-generic-ops.mlir (+123)
``````````diff
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index 0c7b998ffcab9..6be1ca981bfd5 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
@@ -134,14 +135,72 @@ static IndexMatchResult matchOperandMap(AffineMap map, unsigned rowDimIdx,
// All the variants expressed as pseudo regular expression:
// `linalg.{batch_}?matmul{_transpose_a | _transpose_b}?`
// have same number of ins/out, so its easy to stamp different versions.
+// `castTy` is an optional type function that indicates whether (and which) cast
+// attribute is needed for the named matmul op.
template <typename NamedOpTy>
-static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op) {
+static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op,
+ std::optional<TypeFn> castTy) {
+ SmallVector<NamedAttribute> castAttrVec;
+ // Only explicitly specify the cast attribute if the cast type exists and is
+ // pointing to unsigned cast (the default is signed cast for
+ // linalg.matmul/linalg.batch_matmul).
+ if (castTy.has_value() && *castTy == TypeFn::cast_unsigned)
+ castAttrVec = {rewriter.getNamedAttr(
+ "cast", TypeFnAttr::get(rewriter.getContext(), *castTy))};
+
LinalgOp namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(
op, ValueRange{op.getDpsInputs()[0], op.getDpsInputs()[1]},
- ValueRange{op.getDpsInits()[0]});
+ ValueRange{op.getDpsInits()[0]}, castAttrVec);
return namedOp;
}
+// Determines the required cast type for the specialized matmul op (if any)
+// which is expressed in the form of the input linalg.generic op. Also audits
+// that there are no invalid cast ops for matmul inputs/outputs which can't be
+// expressed using the specialized op.
+static bool
+getAndAuditMatmulCastTy(GenericOp genericOp,
+ std::optional<TypeFn> &specializedOpCastTy) {
+ bool foundCastForMatmulOutput = false;
+ SmallVector<TypeFn> castTyFns;
+ genericOp.getBody()->walk([&](CastOpInterface castOp) {
+ // Collect forward slice of the cast op to check if it is for the matmul
+ // output.
+ SetVector<Operation *> forwardSlice;
+ getForwardSlice(castOp, &forwardSlice);
+
+ // If there is no multiplication op in the forward slice, then this cast
+ // op is for the matmul output. Cast ops on matmul output cannot be
+ // expressed by linalg.matmul and linalg.batch_matmul.
+ if (!llvm::any_of(forwardSlice, [](Operation *op) {
+ // We check explicitly for these multiplication ops in
+ // `specializeLinalgContractions()` to infer matmuls.
+ return isa<arith::MulIOp, arith::MulFOp, complex::MulOp>(op);
+ })) {
+ foundCastForMatmulOutput = true;
+ return WalkResult::interrupt();
+ }
+
+ // Determine the cast type.
+ if (isa<arith::ExtUIOp, arith::UIToFPOp, arith::FPToUIOp>(castOp))
+ castTyFns.push_back(TypeFn::cast_unsigned);
+ else if (isa<arith::ExtSIOp, arith::SIToFPOp, arith::FPToSIOp>(castOp))
+ castTyFns.push_back(TypeFn::cast_signed);
+
+ return WalkResult::advance();
+ });
+
+ if (!castTyFns.empty()) {
+ // If there were multiple different cast types found, then we can't express
+ // it correctly using linalg.matmul or linalg.batch_matmul ops. They only
+ // allow a single cast type for all inputs.
+ if (!llvm::all_equal(castTyFns))
+ return false;
+ specializedOpCastTy = castTyFns.front();
+ }
+ return !foundCastForMatmulOutput;
+}
+
// Converts linalg.generic to named linalg.*matmul* where possible.
static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
GenericOp genericOp) {
@@ -230,11 +289,19 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
(a == IndexMatchResult::Transposed && b == IndexMatchResult::Transposed))
return failure();
+ // Get the cast attribute for the named matmul op (if any).
+ std::optional<TypeFn> castTy;
+
+ // If there were invalid cast ops found for matmul, bail out. Else determine
+ // the cast type for the named matmul op (if any).
+ if (!getAndAuditMatmulCastTy(genericOp, castTy))
+ return failure();
+
/// Codegen the different matmul variants.
if (numOfBatchDims) {
- return replaceWithMatmulVariant<BatchMatmulOp>(rewriter, genericOp);
+ return replaceWithMatmulVariant<BatchMatmulOp>(rewriter, genericOp, castTy);
}
- return replaceWithMatmulVariant<MatmulOp>(rewriter, genericOp);
+ return replaceWithMatmulVariant<MatmulOp>(rewriter, genericOp, castTy);
}
/// Utility to specialize a `genericOp` with a convolution op of type `ConvOpTy`
diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index cf495a7d29b70..b1db1154fb357 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -124,3 +124,126 @@ func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>
}
// CHECK-LABEL: op_matvec
// CHECK: linalg.generic
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
+ %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+ %0 = linalg.generic
+ {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>) outs(%Out : tensor<16x32xi32>) {
+ ^bb0(%in: i16, %in_0: i64, %out: i32):
+ %1 = arith.extui %in : i16 to i32
+ %2 = arith.trunci %in_0 : i64 to i32
+ %3 = arith.muli %1, %2 : i32
+ %4 = arith.addi %out, %3 : i32
+ linalg.yield %4 : i32
+ } -> tensor<16x32xi32>
+ return %0 : tensor<16x32xi32>
+}
+
+// CHECK-LABEL: op_matmul_unsigned_cast
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
+ %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+ %0 = linalg.generic
+ {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
+ ^bb0(%in: i16, %in_0: i16, %out: i32):
+ %1 = arith.extsi %in : i16 to i32
+ %2 = arith.extsi %in_0 : i16 to i32
+ %3 = arith.muli %1, %2 : i32
+ %4 = arith.addi %out, %3 : i32
+ linalg.yield %4 : i32
+ } -> tensor<16x32xi32>
+ return %0 : tensor<16x32xi32>
+}
+
+// CHECK-LABEL: op_matmul_signed_cast
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
+ %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+ %0 = linalg.generic
+ {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
+ ^bb0(%in: i16, %in_0: i16, %out: i32):
+ %1 = arith.extui %in : i16 to i32
+ %2 = arith.extsi %in_0 : i16 to i32
+ %3 = arith.muli %1, %2 : i32
+ %4 = arith.addi %out, %3 : i32
+ linalg.yield %4 : i32
+ } -> tensor<16x32xi32>
+ return %0 : tensor<16x32xi32>
+}
+
+// CHECK-LABEL: op_matmul_mixed_cast
+// CHECK: linalg.generic
+// CHECK-NOT: linalg.matmul
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @op_matmul_output_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
+ %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
+ %0 = linalg.generic
+ {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi64>) {
+ ^bb0(%in: i16, %in_0: i16, %out: i64):
+ %1 = arith.extsi %in : i16 to i32
+ %2 = arith.extsi %in_0 : i16 to i32
+ %3 = arith.trunci %out : i64 to i32
+ %4 = arith.muli %1, %2 : i32
+ %5 = arith.addi %3, %4 : i32
+ %6 = arith.extsi %5 : i32 to i64
+ linalg.yield %6 : i64
+ } -> tensor<16x32xi64>
+ return %0 : tensor<16x32xi64>
+}
+
+// CHECK-LABEL: op_matmul_output_cast
+// CHECK: linalg.generic
+// CHECK-NOT: linalg.matmul
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
+ %B: tensor<8x32xi32>,
+ %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
+ %0 = linalg.generic
+ {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+ ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>) outs(%Out : tensor<16x32xf32>) {
+ ^bb0(%in: i32, %in_0: i32, %out: f32):
+ %1 = arith.bitcast %in : i32 to f32
+ %2 = arith.bitcast %in_0 : i32 to f32
+ %3 = arith.mulf %1, %2 : f32
+ %4 = arith.addf %out, %3 : f32
+ linalg.yield %4 : f32
+ } -> tensor<16x32xf32>
+ return %0 : tensor<16x32xf32>
+}
+
+// CHECK-LABEL: op_matmul_bitcast_int_to_float
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
``````````
</details>
https://github.com/llvm/llvm-project/pull/174757
More information about the Mlir-commits
mailing list