[Mlir-commits] [mlir] [mlir][linalg] Generic to category specialization (PR #184624)

Adam Siemieniuk llvmlistbot at llvm.org
Fri Mar 6 05:48:12 PST 2026


https://github.com/adam-smnk updated https://github.com/llvm/llvm-project/pull/184624

>From 40e735fba6452b6cec8c7eefce64857ad5eaeebc Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Wed, 4 Mar 2026 13:40:40 +0100
Subject: [PATCH 01/13] [mlir][linalg] Generic to category specialization

Adds initial support for the generic-to-category linalg morphism.
Only conversion to the contraction op (linalg.contract) is supported for now.
---
 .../Dialect/Linalg/IR/LinalgStructuredOps.td  |   7 +
 mlir/include/mlir/Dialect/Linalg/Passes.td    |   6 +-
 .../Dialect/Linalg/Transforms/Transforms.h    |  34 ++--
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      |  10 +-
 .../Dialect/Linalg/Transforms/MorphOps.cpp    |  12 +-
 .../Dialect/Linalg/Transforms/Specialize.cpp  |  87 ++++++----
 .../Linalg/specialize-generic-ops.mlir        | 150 +++++++++++++++++-
 7 files changed, 249 insertions(+), 57 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 4948bfffad5e0..5998f736ced34 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -889,6 +889,13 @@ def ContractOp : LinalgStructuredBase_Op<"contract", [
 
   let skipDefaultBuilders = 1;
   let builders = [
+    OpBuilder<
+      (ins "ValueRange":$inputs, "ValueRange":$outputs,
+            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        buildStructuredOp($_builder, $_state, std::nullopt, inputs, outputs,
+                          attributes, regionBuilder);
+      }]>,
     OpBuilder<(ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
       "ValueRange":$outputs, "ArrayAttr":$indexingMaps,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index f48ea9849e237..26638b2a644c4 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -70,8 +70,10 @@ def LinalgMorphOpsPass : Pass<"linalg-morph-ops"> {
     
     // Specialization path is not guaranteed.
     Option<"genericToNamed", "generic-to-named", "bool", /*default=*/"false",
-           "convert linalg.generic to equivalent named ops"> ];
-    //  TODOs: `generic-to-category`, `category-to-named`
+           "convert linalg.generic to equivalent named ops">,
+    Option<"genericToCategory", "generic-to-category", "bool", /*default=*/"false",
+           "convert linalg.generic to equivalent category ops"> ];
+    //  TODOs: `category-to-named`
 }
 
 def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops">,
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index fb9cede670801..1e63455fae096 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -923,10 +923,15 @@ FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
                                        LinalgOp linalgOp);
 
-/// Create a namedOp from the given GenericOp and replace the GenericOp.
-/// Currently we can specialize only trivial linalg copy operations.
-FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter,
-                                        GenericOp genericOp);
+struct SpecializationOptions {
+  // Specialize generics to category ops.
+  bool emitCategoryOps = false;
+};
+
+/// Replace the given GenericOp with a namedOp or categoryOp.
+FailureOr<LinalgOp>
+specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
+                    const SpecializationOptions options = {});
 
 /// Create a new buffer using the `allocationFn` provided. The size of this
 /// buffer is either the original subview size when 'useOriginalSubviewSize' is
@@ -1718,17 +1723,24 @@ struct LinalgGeneralizationPattern
 };
 
 struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
-  using OpRewritePattern<GenericOp>::OpRewritePattern;
+
+  LinalgSpecializationPattern(MLIRContext *context,
+                              const SpecializationOptions &options = {},
+                              PatternBenefit benefit = 1)
+      : OpRewritePattern<GenericOp>(context, benefit), options(options) {}
 
   FailureOr<GenericOp>
   returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const {
-    return specializeGenericOp(rewriter, op);
+    return specializeGenericOp(rewriter, op, options);
   }
 
   LogicalResult matchAndRewrite(GenericOp op,
                                 PatternRewriter &rewriter) const override {
     return returningMatchAndRewrite(op, rewriter);
   }
+
+private:
+  SpecializationOptions options;
 };
 
 /// Vectorization pattern for memref::CopyOp.
@@ -1938,13 +1950,13 @@ void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns);
 void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns);
 
 /// Populates `patterns` with patterns to convert linalg.generic ops to named
-/// ops where possible. A linalg.generic can represent wide range and complex
-/// computations for which equivalent linalg named op may not exist e.g.
-/// linalg.generic that takes a tensor and computes a polynomial such as:
+/// or category ops where possible. A linalg.generic can represent a wide range
+/// of complex computations for which equivalent linalg named ops may not exist
+/// e.g. linalg.generic that takes a tensor and computes a polynomial such as:
 ///     p(x) = an*x^n + ... + a1x + a0
-/// There is no equivalent named op to convert to. Many such cases exist.
+/// There are no equivalent ops to convert to. Many such cases exist.
 void populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns);
+    RewritePatternSet &patterns, const SpecializationOptions &options = {});
 
 /// Populates `patterns` that convert linalg named ops e.g. `linalg.add`
 /// to equivalent `linalg.elementwise`.
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index bfc03cc7436df..67d7406987569 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -200,7 +200,10 @@ static void buildMatmulOp(OpBuilder &b, OperationState &state,
       llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
         return AffineMapAttr::get(map);
       });
-  state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  if (none_of(attributes, [](NamedAttribute attr) {
+        return attr.getName() == "indexing_maps";
+      }))
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }
@@ -217,7 +220,10 @@ static void buildBatchMatmulOp(OpBuilder &b, OperationState &state,
       llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
         return AffineMapAttr::get(map);
       });
-  state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  if (none_of(attributes, [](NamedAttribute attr) {
+        return attr.getName() == "indexing_maps";
+      }))
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
index f261ccb1415fe..17416b42c47ab 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
@@ -44,16 +44,16 @@ void LinalgMorphOpsPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
 
   // Lowering paths (named -> category -> generic)
-  if (namedToCategory) {
+  if (namedToCategory)
     populateLinalgNamedToElementwisePatterns(patterns);
-  }
-  if (namedToGeneric || categoryToGeneric) {
+  if (namedToGeneric || categoryToGeneric)
     populateLinalgNamedOpsGeneralizationPatterns(patterns);
-  }
 
   // Lifting paths (named <- category <- generic)
-  if (genericToNamed) {
-    populateLinalgGenericOpsSpecializationPatterns(patterns);
+  if (genericToNamed || genericToCategory) {
+    SpecializationOptions opts;
+    opts.emitCategoryOps = genericToCategory;
+    populateLinalgGenericOpsSpecializationPatterns(patterns, opts);
   }
 
   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index b4de2bb1e1169..93e925fdcf061 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -141,18 +141,23 @@ template <typename NamedOpTy>
 static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op,
                                          std::optional<TypeFn> castTy,
                                          ArrayRef<AffineMap> indexingMaps) {
-  SmallVector<NamedAttribute> castAttrVec;
+  SmallVector<NamedAttribute> attributes;
   // Only explicitly specify the cast attribute for unsigned cast; signed is
   // the default for linalg.matmul/linalg.batch_matmul.
-  if (castTy.has_value() && *castTy == TypeFn::cast_unsigned)
-    castAttrVec = {rewriter.getNamedAttr(
-        "cast", TypeFnAttr::get(rewriter.getContext(), *castTy))};
+  if (castTy.has_value() && *castTy == TypeFn::cast_unsigned) {
+    auto castAttr = rewriter.getNamedAttr(
+        "cast", TypeFnAttr::get(rewriter.getContext(), *castTy));
+    attributes.push_back(castAttr);
+  }
 
-  auto namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(
-      op, ValueRange{op.getDpsInputs()[0], op.getDpsInputs()[1]},
-      ValueRange{op.getDpsInits()[0]}, castAttrVec);
+  // Set the original generic's maps to preserve transposed operand semantics.
+  auto indexingMapsAttr = rewriter.getNamedAttr(
+      "indexing_maps", rewriter.getArrayAttr(indexingMaps));
+  attributes.push_back(indexingMapsAttr);
 
-  namedOp.setIndexingMapsAttr(rewriter.getAffineMapArrayAttr(indexingMaps));
+  LinalgOp namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(
+      op, ValueRange{op.getDpsInputs()[0], op.getDpsInputs()[1]},
+      ValueRange{op.getDpsInits()[0]}, attributes);
 
   return namedOp;
 }
@@ -208,7 +213,8 @@ static std::optional<TypeFn> getCastTypeForMatmulLikeOp(GenericOp genericOp) {
 
 // Converts linalg.generic to named linalg.*matmul* where possible.
 static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
-                                                        GenericOp genericOp) {
+                                                        GenericOp genericOp,
+                                                        bool emitCategoryOp) {
   if (genericOp.getNumDpsInputs() != 2 || genericOp.getNumDpsInits() != 1)
     return failure();
 
@@ -218,6 +224,29 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
                    [](AffineMap m) { return !m.isProjectedPermutation(); }))
     return failure();
 
+  // Only mul+add contraction is supported.
+  if (!mlir::linalg::detail::isContractionBody(
+          *genericOp.getBlock(), [](Operation *first, Operation *second) {
+            return (isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
+                   (isa<arith::MulIOp>(first) && isa<arith::AddIOp>(second)) ||
+                   (isa<complex::MulOp>(first) && isa<complex::AddOp>(second));
+          }))
+    return failure();
+
+  // Determine the cast type for the named matmul op, or bail out if casts
+  // cannot be represented by the named op.
+  std::optional<TypeFn> castTy = getCastTypeForMatmulLikeOp(genericOp);
+  if (!castTy)
+    return rewriter.notifyMatchFailure(
+        genericOp, "contains invalid cast ops for the named matmul op");
+
+  // The category op can represent a wider range of contractions.
+  if (emitCategoryOp)
+    return replaceWithMatmulVariant<ContractOp>(
+        rewriter, genericOp, castTy, genericOp.getIndexingMapsArray());
+
+  // Further checks for named variants.
+  //
   // Linalg generic contraction can be across multiple axis e.g.
   // ```
   //      linalg.generic
@@ -244,14 +273,6 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
   if (dims.m.size() != 1 || dims.n.size() != 1 || dims.k.size() != 1)
     return failure();
 
-  if (!mlir::linalg::detail::isContractionBody(
-          *genericOp.getBlock(), [](Operation *first, Operation *second) {
-            return (isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
-                   (isa<arith::MulIOp>(first) && isa<arith::AddIOp>(second)) ||
-                   (isa<complex::MulOp>(first) && isa<complex::AddOp>(second));
-          }))
-    return failure();
-
   // Check rank of operands
   auto indexingMaps = genericOp.getIndexingMapsArray();
   if (llvm::any_of(indexingMaps, [&dims](AffineMap m) {
@@ -290,13 +311,6 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
   if (llvm::is_contained({a, b, c}, IndexMatchResult::Mismatch))
     return failure();
 
-  // Determine the cast type for the named matmul op, or bail out if casts
-  // cannot be represented by the named op.
-  std::optional<TypeFn> castTy = getCastTypeForMatmulLikeOp(genericOp);
-  if (!castTy)
-    return rewriter.notifyMatchFailure(
-        genericOp, "contains invalid cast ops for the named matmul op");
-
   // Build indexing maps for the named op in its canonical dimension ordering
   auto *ctx = genericOp.getContext();
   unsigned numLoopDims = numOfBatchDims + 3;
@@ -431,8 +445,20 @@ static FailureOr<LinalgOp> specializeLinalgConvolutions(RewriterBase &rewriter,
 //===----------------------------------------------------------------------===//
 // Categorize linalg generic to named op where possible.
 //===----------------------------------------------------------------------===//
-FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(RewriterBase &rewriter,
-                                                      GenericOp genericOp) {
+FailureOr<LinalgOp>
+mlir::linalg::specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
+                                  const SpecializationOptions options) {
+  // Contraction - e.g. matmul
+  if (isaContractionOpInterface(genericOp)) {
+    return specializeLinalgContractions(rewriter, genericOp,
+                                        options.emitCategoryOps);
+  }
+
+  // Early exit in case of category specialization.
+  // TODO: Remove when all variants account for both named and category.
+  if (options.emitCategoryOps)
+    return failure();
+
   // Copy
   if (isaCopyOpInterface(genericOp)) {
     LinalgOp namedOp = rewriter.replaceOpWithNewOp<CopyOp>(
@@ -501,11 +527,6 @@ FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(RewriterBase &rewriter,
     }
   }
 
-  // Contraction - e.g. matmul
-  if (isaContractionOpInterface(genericOp)) {
-    return specializeLinalgContractions(rewriter, genericOp);
-  }
-
   // Convolution - e.g. *conv/pooling*
   if (isaConvolutionOpInterface(genericOp)) {
     return specializeLinalgConvolutions(rewriter, genericOp);
@@ -534,6 +555,6 @@ void LinalgSpecializeGenericOpsPass::runOnOperation() {
 }
 
 void mlir::linalg::populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<LinalgSpecializationPattern>(patterns.getContext());
+    RewritePatternSet &patterns, const SpecializationOptions &options) {
+  patterns.add<LinalgSpecializationPattern>(patterns.getContext(), options);
 }
diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index 5c58a5fedd639..a31901d45a8a6 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-named | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-category | FileCheck %s --check-prefix=CATEGORY
 
 #umap = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
@@ -17,6 +18,10 @@ func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tenso
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.exp ins(%[[A]] : tensor<?x?x?xf32>) outs(%[[Out]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
 
+// Not supported yet.
+// CATEGORY-LABEL: unary_op_exp
+// CATEGORY: linalg.generic
+
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -36,6 +41,10 @@ func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.div ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// Not supported yet.
+// CATEGORY-LABEL: binary_op_div
+// CATEGORY: linalg.generic
+
 // -----
 
 ///----------------------------------------------------------------------------------------
@@ -62,6 +71,17 @@ func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?x
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CATEGORY-LABEL: op_matmul
+// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // Cast-auditing tests: ensure we only specialize when the cast semantics can
 // be expressed by linalg.matmul, and use the cast attribute when needed.
 
@@ -84,6 +104,11 @@ func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
+// CATEGORY-LABEL: op_matmul_unsigned_cast
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
+
 // Ensures truncation rounding is tolerated with unsigned cases.
 // Note: We only consider casts as conflicting if they have different
 // signedness behaviours, and then we do not specialize if they do
@@ -110,6 +135,11 @@ func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
+// CATEGORY-LABEL: op_matmul_unsigned_cast_and_truncate
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
+
 // Signed casts are the default, no cast attribute is required.
 func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                  %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
@@ -131,6 +161,11 @@ func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
 // CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 // CHECK: linalg.matmul
 
+// CATEGORY-LABEL: op_matmul_signed_cast
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-NOT: {cast =
+
 // Mixed signed/unsigned inputs cannot be encoded with a single cast attribute.
 func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                 %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
@@ -151,6 +186,10 @@ func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi
 // CHECK: linalg.generic
 // CHECK-NOT: linalg.matmul
 
+// CATEGORY-LABEL: negative_op_matmul_mixed_cast
+// CATEGORY: linalg.generic
+// CATEGORY-NOT: linalg.contract
+
 // Output-side casts are not representable by the named matmul ops.
 func.func @negative_op_matmul_output_cast(%A: tensor<16x8xi32>, %B: tensor<8x32xi32>,
                                  %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
@@ -171,6 +210,10 @@ func.func @negative_op_matmul_output_cast(%A: tensor<16x8xi32>, %B: tensor<8x32x
 // CHECK: linalg.generic
 // CHECK-NOT: linalg.matmul
 
+// CATEGORY-LABEL: negative_op_matmul_output_cast
+// CATEGORY: linalg.generic
+// CATEGORY-NOT: linalg.contract
+
 // Bitcasts are not modeled by the cast attribute, but should not block
 // specialization.
 // NOTE: Bitcasts are not preserved by the matmul named op during
@@ -196,6 +239,10 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.matmul
 
+// CATEGORY-LABEL: op_matmul_bitcast_int_to_float
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+
 // Signed float casts only use sitofp, which defaults to signed semantics.
 func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                        %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
@@ -217,6 +264,11 @@ func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16
 // CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 // CHECK: linalg.matmul
 
+// CATEGORY-LABEL: op_matmul_signed_cast_float
+// CATEGORY-NOT: linalg.generic
+// CATEGORY-NOT: linalg.contract{{.*}}{cast =
+// CATEGORY: linalg.contract
+
 // Unsigned float casts are expressed via uitofp and use the unsigned cast attr.
 func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                          %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
@@ -237,6 +289,10 @@ func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
+// CATEGORY-LABEL: op_matmul_unsigned_cast_float
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract{{.*}}{cast = #linalg.type_fn<cast_unsigned>}
+
 // -----
 
 ///----------------------------------------------------------------------------------------
@@ -263,6 +319,16 @@ func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>, %Out:
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-LABEL: op_batch_matmul
+// CATEGORY-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[Out:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
 // Ensure that the unsigned cast path for cast detection is exercised for
 // batch_matmul as well.
 func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
@@ -287,9 +353,14 @@ func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
 // CHECK-NOT: linalg.generic
 // CHECK: linalg.batch_matmul {cast = #linalg.type_fn<cast_unsigned>}
 
+// CATEGORY-LABEL: op_batch_matmul_unsigned_cast
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
+
 // -----
 
-// This is a multi-reduction linalg.generic and cannot be lifted to matrix multiply
+// A multi-reduction contraction.
 #mapA = affine_map<(m, n, k1, k2) -> (m, k1, k2)>
 #mapB = affine_map<(m, n, k1, k2) -> (k2, k1, n)>
 #mapC = affine_map<(m, n, k1, k2) -> (m, n)>
@@ -309,9 +380,14 @@ func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
   return %0 : tensor<10x40xf32>
 }
 
+// Cannot be lifted to named matrix multiply.
 // CHECK-LABEL: negative_op_multi_reduction
 // CHECK: linalg.generic
 
+// CATEGORY-LABEL: negative_op_multi_reduction
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+
 // -----
 
 // Batch dim not in identity position: batch dim d0 appears at result
@@ -332,12 +408,17 @@ func.func @negative_batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: t
   return %0 : tensor<2x4x16xf32>
 }
 
+// Cannot be lifted to named matrix multiply.
 // CHECK-LABEL: negative_batch_matmul_non_identity_batch
 // CHECK: linalg.generic
 
+// CATEGORY-LABEL: negative_batch_matmul_non_identity_batch
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+
 // -----
 
-// TODO: matvec
+// TODO: named matvec
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1)>
 #map2 = affine_map<(d0, d1) -> (d0)>
@@ -355,6 +436,10 @@ func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>
 // CHECK-LABEL: op_matvec
 // CHECK: linalg.generic
 
+// CATEGORY-LABEL: op_matvec
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+
 // -----
 
 // Matmul transpose A: A is accessed as (k, m) instead of (m, k)
@@ -384,6 +469,10 @@ func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-LABEL: op_matmul_transpose_a
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+
 // -----
 
 // Matmul transpose B: B is accessed as (n, k) instead of (k, n)
@@ -413,6 +502,17 @@ func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-LABEL: op_matmul_transpose_b
+// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // -----
 
 // Batch matmul transpose A: A is accessed as (b, k, m) instead of (b, m, k)
@@ -442,6 +542,17 @@ func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16x
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
+// CATEGORY-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-LABEL: op_batch_matmul_transpose_a
+// CATEGORY-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+
 // -----
 
 // Batch matmul transpose B: B is accessed as (b, n, k) instead of (b, k, n)
@@ -471,6 +582,17 @@ func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8x
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-LABEL: op_batch_matmul_transpose_b
+// CATEGORY-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+
 // -----
 
 // Both A and B transposed.
@@ -501,6 +623,17 @@ func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-LABEL: op_matmul_transpose_a_and_b
+// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // -----
 
 // Output transposed: C is accessed as (n, m) instead of (m, n).
@@ -531,6 +664,17 @@ func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
 // CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CATEGORY-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+// CATEGORY-LABEL: op_matmul_transposed_output
+// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // -----
 
 // Matmul with non-canonical loop ordering.

>From 86ab2e103f8b1365642b62e9462eee27429d35ae Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Wed, 4 Mar 2026 15:48:04 +0100
Subject: [PATCH 02/13] Pass opts by reference

---
 mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 2 +-
 mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 1e63455fae096..53d6f1b1be0c1 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -931,7 +931,7 @@ struct SpecializationOptions {
 /// Replace the given GenericOp with a namedOp or categoryOp.
 FailureOr<LinalgOp>
 specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
-                    const SpecializationOptions options = {});
+                    const SpecializationOptions &options = {});
 
 /// Create a new buffer using the `allocationFn` provided. The size of this
 /// buffer is either the original subview size when 'useOriginalSubviewSize' is
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index 93e925fdcf061..5fd13b1755247 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -447,7 +447,7 @@ static FailureOr<LinalgOp> specializeLinalgConvolutions(RewriterBase &rewriter,
 //===----------------------------------------------------------------------===//
 FailureOr<LinalgOp>
 mlir::linalg::specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
-                                  const SpecializationOptions options) {
+                                  const SpecializationOptions &options) {
   // Contraction - e.g. matmul
   if (isaContractionOpInterface(genericOp)) {
     return specializeLinalgContractions(rewriter, genericOp,

>From f14674bd528f9e901dae3432e09715f32044c688 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Wed, 4 Mar 2026 16:00:51 +0100
Subject: [PATCH 03/13] Improve docs phrasing

---
 mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 2 +-
 mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp        | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 53d6f1b1be0c1..4a0e0d4eb50b8 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1954,7 +1954,7 @@ void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns);
 /// and complex computations for which equivalent linalg named op may not exist
 /// e.g. linalg.generic that takes a tensor and computes a polynomial such as:
 ///     p(x) = an*x^n + ... + a1x + a0
-/// There is no equivalent ops to convert to. Many such cases exist.
+/// There is no equivalent named op to convert to. Many such cases exist.
 void populateLinalgGenericOpsSpecializationPatterns(
     RewritePatternSet &patterns, const SpecializationOptions &options = {});
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index 5fd13b1755247..49ce621d6d36f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -150,7 +150,8 @@ static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op,
     attributes.push_back(castAttr);
   }
 
-  // Set the original generic's maps to preserve transposed operand semantics.
+  // Set the original generic's maps to preserve operand indexing semantics like
+  // transposition.
   auto indexingMapsAttr = rewriter.getNamedAttr(
       "indexing_maps", rewriter.getArrayAttr(indexingMaps));
   attributes.push_back(indexingMapsAttr);
@@ -240,7 +241,7 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
     return rewriter.notifyMatchFailure(
         genericOp, "contains invalid cast ops for the named matmul op");
 
-  // In case of category op, wider range of representation is supported.
+  // In case of a category op, a wider range of variants is supported.
   if (emitCategoryOp)
     return replaceWithMatmulVariant<ContractOp>(
         rewriter, genericOp, castTy, genericOp.getIndexingMapsArray());

>From a9f3b897fe06f933e6cc71573aabc08f13b85ade Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Wed, 4 Mar 2026 16:03:16 +0100
Subject: [PATCH 04/13] Mention default opt

---
 mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 4a0e0d4eb50b8..2fc083bf7b871 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -924,7 +924,7 @@ FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
                                        LinalgOp linalgOp);
 
 struct SpecializationOptions {
-  // Specialize generics to category ops.
+  // Specialize generics to category ops (default: named ops).
   bool emitCategoryOps = false;
 };
 

>From ed3ddd69051d12d1c31cebbec4596d0e05af39a1 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Thu, 5 Mar 2026 10:32:19 +0100
Subject: [PATCH 05/13] Rename matching test cases

---
 .../Dialect/Linalg/specialize-generic-ops.mlir | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index a31901d45a8a6..2e06f4987f7bc 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -364,9 +364,9 @@ func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
 #mapA = affine_map<(m, n, k1, k2) -> (m, k1, k2)>
 #mapB = affine_map<(m, n, k1, k2) -> (k2, k1, n)>
 #mapC = affine_map<(m, n, k1, k2) -> (m, n)>
-func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
-                                       %B: tensor<30x20x40xf32>,
-                                       %C: tensor<10x40xf32>) -> tensor<10x40xf32> {
+func.func @op_multi_reduction(%A: tensor<10x20x30xf32>,
+                              %B: tensor<30x20x40xf32>,
+                              %C: tensor<10x40xf32>) -> tensor<10x40xf32> {
   %0 = linalg.generic
            {indexing_maps = [#mapA, #mapB, #mapC],
             iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
@@ -381,10 +381,10 @@ func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
 }
 
 // Cannot be lifted to named matrix multiply.
-// CHECK-LABEL: negative_op_multi_reduction
+// CHECK-LABEL: op_multi_reduction
 // CHECK: linalg.generic
 
-// CATEGORY-LABEL: negative_op_multi_reduction
+// CATEGORY-LABEL: op_multi_reduction
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 
@@ -395,8 +395,8 @@ func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
 #mapBni0 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
 #mapBni1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #mapBni2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8x16xf32>,
-                                                     %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8x16xf32>,
+                                           %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
            {indexing_maps = [#mapBni0, #mapBni1, #mapBni2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
            ins(%A, %B : tensor<4x2x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x4x16xf32>) {
@@ -409,10 +409,10 @@ func.func @negative_batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: t
 }
 
 // Cannot be lifted to named matrix multiply.
-// CHECK-LABEL: negative_batch_matmul_non_identity_batch
+// CHECK-LABEL: batch_matmul_non_identity_batch
 // CHECK: linalg.generic
 
-// CATEGORY-LABEL: negative_batch_matmul_non_identity_batch
+// CATEGORY-LABEL: batch_matmul_non_identity_batch
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 

>From 2b358ec3f5e6247099074eab16bfe7015ad58846 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Thu, 5 Mar 2026 10:55:15 +0100
Subject: [PATCH 06/13] Roundtrip test

---
 .../Linalg/roundtrip-linalg-category-ops.mlir | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir

diff --git a/mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir
new file mode 100644
index 0000000000000..bfecf28a33c70
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir
@@ -0,0 +1,101 @@
+// The following tests take linalg category ops, lower them to linalg.generic,
+// and then lift them back up to category ops.
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=category-to-generic \
+// RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-category \
+// RUN: | FileCheck %s
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @contract_matmul(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
+    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL: contract_matmul
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+
+func.func @contract_matmul_memref(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
+    %arg2: memref<?x?xf32>) {
+  linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : memref<?x?xf32>, memref<?x?xf32>)
+    outs(%arg2 : memref<?x?xf32>)
+  return
+}
+
+// CHECK-LABEL: contract_matmul_memref
+// CHECK-SAME: %[[A:.+]]: memref<?x?xf32>, %[[B:.+]]: memref<?x?xf32>,
+// CHECK-SAME: %[[Out:.+]]: memref<?x?xf32>)
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: ins(%[[A]], %[[B]] : memref<?x?xf32>, memref<?x?xf32>)
+// CHECK-SAME: outs(%[[Out]] : memref<?x?xf32>)
+
+func.func @contract_matmul_bitcast_int_to_float(%arg0: tensor<16x8xi32>,
+    %arg1: tensor<8x32xi32>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%arg2 : tensor<16x32xf32>) -> tensor<16x32xf32>
+  return %0 : tensor<16x32xf32>
+}
+
+// CHECK-LABEL: contract_matmul_bitcast_int_to_float
+// CHECK-SAME: %[[A:.+]]: tensor<16x8xi32>, %[[B:.+]]: tensor<8x32xi32>,
+// CHECK-SAME: %[[Out:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-NOT: cast =
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi32>, tensor<8x32xi32>)
+// CHECK-SAME: outs(%[[Out]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+
+func.func @contract_matmul_unsigned_cast_float(%arg0: tensor<16x8xi16>,
+    %arg1: tensor<8x32xi16>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    {cast = #linalg.type_fn<cast_unsigned>}
+    ins(%arg0, %arg1 : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%arg2 : tensor<16x32xf32>) -> tensor<16x32xf32>
+  return %0 : tensor<16x32xf32>
+}
+
+// CHECK-LABEL: contract_matmul_unsigned_cast_float
+// CHECK-SAME: %[[A:.+]]: tensor<16x8xi16>, %[[B:.+]]: tensor<8x32xi16>,
+// CHECK-SAME: %[[Out:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: cast = #linalg.type_fn<cast_unsigned>
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi16>, tensor<8x32xi16>)
+// CHECK-SAME: outs(%[[Out]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+func.func @contract_multi_reduction(%arg0: tensor<10x20x30xf32>,
+    %arg1: tensor<30x20x40xf32>, %arg2: tensor<10x40xf32>) -> tensor<10x40xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
+    outs(%arg2 : tensor<10x40xf32>) -> tensor<10x40xf32>
+  return %0 : tensor<10x40xf32>
+}
+
+// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2, d1)>
+// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+
+// CHECK-LABEL: contract_multi_reduction
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}

>From 2349534171fc6d342c7adaf0569177f770c5b8b2 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 11:51:11 +0100
Subject: [PATCH 07/13] Fix after rebase

---
 mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index 49ce621d6d36f..e5c0b979369aa 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -152,8 +152,12 @@ static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op,
 
   // Set the original generic's maps to preserve operand indexing semantics like
   // transposition.
+  SmallVector<Attribute, 3> indexingMapsAttrVal =
+      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
+        return AffineMapAttr::get(map);
+      });
   auto indexingMapsAttr = rewriter.getNamedAttr(
-      "indexing_maps", rewriter.getArrayAttr(indexingMaps));
+      "indexing_maps", rewriter.getArrayAttr(indexingMapsAttrVal));
   attributes.push_back(indexingMapsAttr);
 
   LinalgOp namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(

>From 70cb93bb036c8ab9f5abd2fe88da597d9dfab5d4 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 13:55:04 +0100
Subject: [PATCH 08/13] Refactor and expand specialize test

---
 .../Linalg/specialize-generic-ops.mlir        | 515 ++++++++++--------
 1 file changed, 297 insertions(+), 218 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index 2e06f4987f7bc..33b360b6c9ef5 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -1,5 +1,8 @@
-// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-named | FileCheck %s
-// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-category | FileCheck %s --check-prefix=CATEGORY
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-named \
+// RUN: | FileCheck %s --check-prefix=NAMED,ALL
+
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-category \
+// RUN: | FileCheck %s --check-prefix=CATEGORY,ALL
 
 #umap = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
@@ -13,13 +16,15 @@ func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tenso
   return %0 : tensor<?x?x?xf32>
 }
 
-// CHECK-LABEL: unary_op_exp
-// CHECK-SAME: %[[A:.+]]: tensor<?x?x?xf32>, %[[Out:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.exp ins(%[[A]] : tensor<?x?x?xf32>) outs(%[[Out]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// ALL-LABEL: unary_op_exp
+// ALL-SAME: %[[A:.+]]: tensor<?x?x?xf32>, %[[OUT:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.exp
+// NAMED-SAME: ins(%[[A]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
 
 // Not supported yet.
-// CATEGORY-LABEL: unary_op_exp
 // CATEGORY: linalg.generic
 
 // -----
@@ -36,13 +41,16 @@ func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<
   return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: binary_op_div
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.div ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-LABEL: binary_op_div
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// ALL-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.div
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // Not supported yet.
-// CATEGORY-LABEL: binary_op_div
 // CATEGORY: linalg.generic
 
 // -----
@@ -66,21 +74,24 @@ func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?x
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
 // CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 // CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 // CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 
-// CATEGORY-LABEL: op_matmul
-// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-LABEL: op_matmul
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// ALL-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // CATEGORY-NOT: linalg.generic
-// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // Cast-auditing tests: ensure we only specialize when the cast semantics can
 // be expressed by linalg.matmul, and use the cast attribute when needed.
@@ -100,13 +111,14 @@ func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
   return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
-// CATEGORY-LABEL: op_matmul_unsigned_cast
 // CATEGORY-NOT: linalg.generic
-// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
 
 // Ensures truncation rounding is tolerated with unsigned cases.
@@ -131,13 +143,14 @@ func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor
   return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast_and_truncate
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast_and_truncate
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
-// CATEGORY-LABEL: op_matmul_unsigned_cast_and_truncate
 // CATEGORY-NOT: linalg.generic
-// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
 
 // Signed casts are the default, no cast attribute is required.
@@ -156,15 +169,18 @@ func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
    return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_signed_cast
-// CHECK-NOT: linalg.generic
-// CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_signed_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// NAMED: linalg.matmul
 
-// CATEGORY-LABEL: op_matmul_signed_cast
 // CATEGORY-NOT: linalg.generic
-// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY: %[[RES:.+]] = linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-NOT: {cast =
+// CATEGORY-SAME: ins
+// CATEGORY: return %[[RES]]
 
 // Mixed signed/unsigned inputs cannot be encoded with a single cast attribute.
 func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
@@ -182,11 +198,11 @@ func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi
    return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: negative_op_matmul_mixed_cast
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: negative_op_matmul_mixed_cast
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
 
-// CATEGORY-LABEL: negative_op_matmul_mixed_cast
 // CATEGORY: linalg.generic
 // CATEGORY-NOT: linalg.contract
 
@@ -206,11 +222,11 @@ func.func @negative_op_matmul_output_cast(%A: tensor<16x8xi32>, %B: tensor<8x32x
    return %0 : tensor<16x32xi64>
 }
 
-// CHECK-LABEL: negative_op_matmul_output_cast
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: negative_op_matmul_output_cast
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
 
-// CATEGORY-LABEL: negative_op_matmul_output_cast
 // CATEGORY: linalg.generic
 // CATEGORY-NOT: linalg.contract
 
@@ -235,11 +251,11 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_bitcast_int_to_float
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_bitcast_int_to_float
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
 
-// CATEGORY-LABEL: op_matmul_bitcast_int_to_float
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 
@@ -259,15 +275,16 @@ func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_signed_cast_float
-// CHECK-NOT: linalg.generic
-// CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_signed_cast_float
+
+// NAMED-NOT: linalg.generic
+// NAMED-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// NAMED: linalg.matmul
 
-// CATEGORY-LABEL: op_matmul_signed_cast_float
 // CATEGORY-NOT: linalg.generic
 // CATEGORY-NOT: linalg.contract{{.*}}{cast =
-// CATEGORY: linalg.contract
+// CATEGORY: %[[RES:.+]] = linalg.contract
+// CATEGORY: return %[[RES]]
 
 // Unsigned float casts are expressed via uitofp and use the unsigned cast attr.
 func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
@@ -285,11 +302,11 @@ func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast_float
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast_float
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
 
-// CATEGORY-LABEL: op_matmul_unsigned_cast_float
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract{{.*}}{cast = #linalg.type_fn<cast_unsigned>}
 
@@ -314,20 +331,20 @@ func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>, %Out:
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: op_batch_matmul
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[Out:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-
 // CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 // CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 // CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CATEGORY-LABEL: op_batch_matmul
-// CATEGORY-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[Out:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// ALL-LABEL: op_batch_matmul
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[OUT:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // Ensure that the unsigned cast path for cast detection is exercised for
 // batch_matmul as well.
@@ -349,11 +366,11 @@ func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
    return %0 : tensor<2x16x16xi32>
 }
 
-// CHECK-LABEL: op_batch_matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_batch_matmul_unsigned_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul {cast = #linalg.type_fn<cast_unsigned>}
 
-// CATEGORY-LABEL: op_batch_matmul_unsigned_cast
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
@@ -380,11 +397,11 @@ func.func @op_multi_reduction(%A: tensor<10x20x30xf32>,
   return %0 : tensor<10x40xf32>
 }
 
+// ALL-LABEL: op_multi_reduction
+
 // Cannot be lifted to named matrix multiply.
-// CHECK-LABEL: op_multi_reduction
-// CHECK: linalg.generic
+// NAMED: linalg.generic
 
-// CATEGORY-LABEL: op_multi_reduction
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 
@@ -408,11 +425,11 @@ func.func @batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8
   return %0 : tensor<2x4x16xf32>
 }
 
+// ALL-LABEL: batch_matmul_non_identity_batch
+
 // Cannot be lifted to named matrix multiply.
-// CHECK-LABEL: batch_matmul_non_identity_batch
-// CHECK: linalg.generic
+// NAMED: linalg.generic
 
-// CATEGORY-LABEL: batch_matmul_non_identity_batch
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 
@@ -433,10 +450,11 @@ func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>
   } -> tensor<?xf32>
   return %0 : tensor<?xf32>
 }
-// CHECK-LABEL: op_matvec
-// CHECK: linalg.generic
 
-// CATEGORY-LABEL: op_matvec
+// ALL-LABEL: op_matvec
+
+// NAMED: linalg.generic
+
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 
@@ -458,20 +476,22 @@ func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_a
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// CATEGORY-LABEL: op_matmul_transpose_a
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_a
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
 
 // -----
 
@@ -491,27 +511,24 @@ func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
-// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CATEGORY-LABEL: op_matmul_transpose_b
-// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 // CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -531,27 +548,24 @@ func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16x
   return %0 : tensor<2x4x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_transpose_a
-// CHECK-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
-
-// CATEGORY-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
-// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CATEGORY-LABEL: op_batch_matmul_transpose_a
-// CATEGORY-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// ALL-LABEL: op_batch_matmul_transpose_a
+// ALL-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[OUT:.+]]: tensor<2x4x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 // CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
 // -----
 
@@ -571,27 +585,24 @@ func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8x
   return %0 : tensor<2x4x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// ALL-LABEL: op_batch_matmul_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[OUT:.+]]: tensor<2x4x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
-// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CATEGORY-LABEL: op_batch_matmul_transpose_b
-// CATEGORY-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 // CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
 // -----
 
@@ -612,27 +623,24 @@ func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_a_and_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// CATEGORY-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
-// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CATEGORY-LABEL: op_matmul_transpose_a_and_b
-// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_a_and_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 // CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -653,27 +661,24 @@ func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CHECK-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
-// CHECK-LABEL: op_matmul_transposed_output
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// ALL-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+
+// ALL-LABEL: op_matmul_transposed_output
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
-// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CATEGORY-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
-// CATEGORY-LABEL: op_matmul_transposed_output
-// CATEGORY-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
 // CATEGORY-NOT: linalg.generic
 // CATEGORY: linalg.contract
 // CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
 // CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CATEGORY-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -695,11 +700,23 @@ func.func @op_matmul_non_canonical_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul_non_canonical_loops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+
+// ALL-LABEL: op_matmul_non_canonical_loops
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 // -----
 
 // Batch matmul with non-canonical loop ordering.
@@ -720,10 +737,23 @@ func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tenso
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: op_batch_matmul_non_canonical_loops
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+
+// ALL-LABEL: op_batch_matmul_non_canonical_loops
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[OUT:.+]]: tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // -----
 
@@ -745,16 +775,28 @@ func.func @op_matmul_non_canonical_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_non_canonical_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// NAMED-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// NAMED-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// NAMED-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+
+// ALL-LABEL: op_matmul_non_canonical_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -777,16 +819,28 @@ func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B:
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_non_canonical_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// NAMED-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// NAMED-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// NAMED-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+
+// ALL-LABEL: op_batch_matmul_non_canonical_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[OUT:.+]]: tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // -----
 
@@ -808,10 +862,23 @@ func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf3
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul_fully_shuffled_loops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+
+// ALL-LABEL: op_matmul_fully_shuffled_loops
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -819,7 +886,7 @@ func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf3
 #map_bcast_a = affine_map<(d0, d1, d2) -> (d2)>
 #map_bcast_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_bcast_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @negative_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
+func.func @op_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
                                         %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
          {indexing_maps = [#map_bcast_a, #map_bcast_b, #map_bcast_c],
@@ -833,9 +900,13 @@ func.func @negative_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
    return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: negative_matmul_broadcast_a
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: op_matmul_broadcast_a
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -843,7 +914,7 @@ func.func @negative_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
 #map_bbcast_a = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
 #map_bbcast_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_bbcast_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf32>,
+func.func @op_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf32>,
                                               %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
            {indexing_maps = [#map_bbcast_a, #map_bbcast_b, #map_bbcast_c],
@@ -857,9 +928,13 @@ func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: negative_batch_matmul_broadcast_a
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.batch_matmul
+// ALL-LABEL: op_batch_matmul_broadcast_a
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.batch_matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -867,7 +942,7 @@ func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x
 #map_bbcast2_a = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map_bbcast2_b = affine_map<(d0, d1, d2, d3) -> (d3)>
 #map_bbcast2_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<8xf32>,
+func.func @op_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<8xf32>,
                                               %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
            {indexing_maps = [#map_bbcast2_a, #map_bbcast2_b, #map_bbcast2_c],
@@ -881,6 +956,10 @@ func.func @negative_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: negative_batch_matmul_broadcast_b
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.batch_matmul
+// ALL-LABEL: op_batch_matmul_broadcast_b
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.batch_matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract

>From 39ddc5b61101b23f38f242b25c4ace9b28bea6c5 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 14:08:14 +0100
Subject: [PATCH 09/13] Rename and refactor roundtrip morph tests

---
 .../Linalg/roundtrip-linalg-named-ops.mlir    | 72 --------------
 ...undtrip-morphism-linalg-category-ops.mlir} | 32 +++---
 .../roundtrip-morphism-linalg-named-ops.mlir  | 99 +++++++++++++++++++
 3 files changed, 118 insertions(+), 85 deletions(-)
 delete mode 100644 mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir
 rename mlir/test/Dialect/Linalg/{roundtrip-linalg-category-ops.mlir => roundtrip-morphism-linalg-category-ops.mlir} (78%)
 create mode 100644 mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir

diff --git a/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir
deleted file mode 100644
index f15ae646e5765..0000000000000
--- a/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir
+++ /dev/null
@@ -1,72 +0,0 @@
-// The following test examples of linalg named ops lowered to linalg.generic and then
-// lifted back up to named op.
-// RUN: mlir-opt %s -linalg-generalize-named-ops | mlir-opt --linalg-specialize-generic-ops | FileCheck %s
-
-func.func @unary_exp(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
-  linalg.exp ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
-  return
-}
-
-// CHECK-LABEL: unary_exp
-// CHECK-SAME: %[[A:.+]]: memref<7x14x21xf32>, %[[Out:.+]]: memref<7x14x21xf32>)
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.exp ins(%[[A]] : memref<7x14x21xf32>) outs(%[[Out]] : memref<7x14x21xf32>)
-
-// -----
-
-func.func @binary_add(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = linalg.add ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: binary_add
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.add ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// -----
-
-
-///----------------------------------------------------------------------------------------
-/// Tests for linalg.matmul
-///----------------------------------------------------------------------------------------
-
-func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: @matmul
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// -----
-
-// Check matmul with unsigned cast is correctly raised back to named op.
-func.func @matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
-                                %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
-  %0 = linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-                     ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
-                     outs(%Out : tensor<16x32xi32>) -> tensor<16x32xi32>
-  return %0 : tensor<16x32xi32>
-}
-
-// CHECK-LABEL: @matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-
-// -----
-
-func.func @mixed_named_ops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                   %C: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %AB = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %1 = linalg.add ins(%AB, %C : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: @mixed_named_ops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[C:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: %[[AB:.+]] = linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK: linalg.add ins(%[[AB]], %[[C]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
diff --git a/mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
similarity index 78%
rename from mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir
rename to mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
index bfecf28a33c70..d5e49a866eaec 100644
--- a/mlir/test/Dialect/Linalg/roundtrip-linalg-category-ops.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
@@ -1,5 +1,6 @@
 // The following test examples of linalg category ops lowered to linalg.generic
 // and then lifted back up to category op.
+
 // RUN: mlir-opt %s -split-input-file -linalg-morph-ops=category-to-generic \
 // RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-category \
 // RUN: | FileCheck %s
@@ -21,11 +22,12 @@ func.func @contract_matmul(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
 
 // CHECK-LABEL: contract_matmul
 // CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
-// CHECK-SAME: %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
 // CHECK-NOT: linalg.generic
-// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 
 func.func @contract_matmul_memref(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
@@ -38,11 +40,12 @@ func.func @contract_matmul_memref(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>
 
 // CHECK-LABEL: contract_matmul_memref
 // CHECK-SAME: %[[A:.+]]: memref<?x?xf32>, %[[B:.+]]: memref<?x?xf32>,
-// CHECK-SAME: %[[Out:.+]]: memref<?x?xf32>)
+// CHECK-SAME: %[[OUT:.+]]: memref<?x?xf32>)
 // CHECK-NOT: linalg.generic
-// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CHECK-SAME: ins(%[[A]], %[[B]] : memref<?x?xf32>, memref<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : memref<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<?x?xf32>)
 
 func.func @contract_matmul_bitcast_int_to_float(%arg0: tensor<16x8xi32>,
     %arg1: tensor<8x32xi32>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
@@ -54,12 +57,13 @@ func.func @contract_matmul_bitcast_int_to_float(%arg0: tensor<16x8xi32>,
 
 // CHECK-LABEL: contract_matmul_bitcast_int_to_float
 // CHECK-SAME: %[[A:.+]]: tensor<16x8xi32>, %[[B:.+]]: tensor<8x32xi32>,
-// CHECK-SAME: %[[Out:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-SAME: %[[OUT:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
 // CHECK-NOT: linalg.generic
-// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CHECK-NOT: cast =
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi32>, tensor<8x32xi32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-SAME: outs(%[[OUT]] : tensor<16x32xf32>) -> tensor<16x32xf32>
 
 func.func @contract_matmul_unsigned_cast_float(%arg0: tensor<16x8xi16>,
     %arg1: tensor<8x32xi16>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
@@ -72,12 +76,13 @@ func.func @contract_matmul_unsigned_cast_float(%arg0: tensor<16x8xi16>,
 
 // CHECK-LABEL: contract_matmul_unsigned_cast_float
 // CHECK-SAME: %[[A:.+]]: tensor<16x8xi16>, %[[B:.+]]: tensor<8x32xi16>,
-// CHECK-SAME: %[[Out:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-SAME: %[[OUT:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
 // CHECK-NOT: linalg.generic
-// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
 // CHECK-SAME: cast = #linalg.type_fn<cast_unsigned>
 // CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi16>, tensor<8x32xi16>)
-// CHECK-SAME: outs(%[[Out]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-SAME: outs(%[[OUT]] : tensor<16x32xf32>) -> tensor<16x32xf32>
 
 // -----
 
@@ -98,4 +103,5 @@ func.func @contract_multi_reduction(%arg0: tensor<10x20x30xf32>,
 
 // CHECK-LABEL: contract_multi_reduction
 // CHECK-NOT: linalg.generic
-// CHECK: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
diff --git a/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
new file mode 100644
index 0000000000000..19b30ef10da84
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
@@ -0,0 +1,99 @@
+// The following tests check examples of linalg named ops lowered to
+// linalg.generic and then lifted back up to named ops.
+
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=named-to-generic \
+// RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-named \
+// RUN: | FileCheck %s
+
+func.func @unary_exp(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
+  linalg.exp ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK-LABEL: unary_exp
+// CHECK-SAME: %[[A:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.exp
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// -----
+
+func.func @binary_add(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.add
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: binary_add
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.add
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// -----
+
+
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matmul
+///----------------------------------------------------------------------------------------
+
+func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: @matmul
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// -----
+
+// Check matmul with unsigned cast is correctly raised back to named op.
+func.func @matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
+    %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+  %0 = linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
+    outs(%Out : tensor<16x32xi32>) -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
+}
+
+// CHECK-LABEL: @matmul_unsigned_cast
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
+// CHECK-SAME: {cast = #linalg.type_fn<cast_unsigned>}
+
+// -----
+
+func.func @mixed_named_ops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %C: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %AB = linalg.matmul
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %1 = linalg.add
+    ins(%AB, %C : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: @mixed_named_ops
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[C:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: %[[AB:.+]] = linalg.matmul
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK: linalg.add
+// CHECK-SAME: ins(%[[AB]], %[[C]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>

>From 0cd003b676d41141274949d22c99b74609fc7dd7 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 14:17:50 +0100
Subject: [PATCH 10/13] Refactor specialize test layout

---
 .../Linalg/specialize-generic-ops.mlir        | 413 ++++++++++--------
 1 file changed, 232 insertions(+), 181 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index 33b360b6c9ef5..17f1d0eea0512 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -7,8 +7,10 @@
 #umap = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = linalg.generic
-          {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
-          ins(%A : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+    {indexing_maps = [#umap, #umap],
+    iterator_types = ["parallel", "parallel","parallel"]}
+    ins(%A : tensor<?x?x?xf32>)
+    outs(%Out : tensor<?x?x?xf32>) {
   ^bb0(%in: f32, %out: f32):
     %1 = math.exp %in : f32
     linalg.yield %1 : f32
@@ -30,10 +32,13 @@ func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tenso
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                         %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+    {indexing_maps = [#map, #map, #map],
+    iterator_types = ["parallel", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.divf %in, %in_0 : f32
     linalg.yield %1 : f32
@@ -62,16 +67,19 @@ func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                     %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -100,8 +108,10 @@ func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?x
 func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
                                    %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi32>) outs(%Out : tensor<16x32xi32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xi32>) {
   ^bb0(%in: i16, %in_0: i32, %out: i32):
     %1 = arith.extui %in : i16 to i32
     %3 = arith.muli %1, %in_0 : i32
@@ -131,8 +141,10 @@ func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
 func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
                                                 %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>) outs(%Out : tensor<16x32xi32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
+    outs(%Out : tensor<16x32xi32>) {
   ^bb0(%in: i16, %in_0: i64, %out: i32):
     %1 = arith.extui %in : i16 to i32
     %2 = arith.trunci %in_0 : i64 to i32
@@ -157,16 +169,18 @@ func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor
 func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                  %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
-   ^bb0(%in: i16, %in_0: i16, %out: i32):
-     %1 = arith.extsi %in : i16 to i32
-     %2 = arith.extsi %in_0 : i16 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<16x32xi32>
-   return %0 : tensor<16x32xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xi32>) {
+  ^bb0(%in: i16, %in_0: i16, %out: i32):
+    %1 = arith.extsi %in : i16 to i32
+    %2 = arith.extsi %in_0 : i16 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
 }
 
 // ALL-LABEL: op_matmul_signed_cast
@@ -184,18 +198,20 @@ func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
 
 // Mixed signed/unsigned inputs cannot be encoded with a single cast attribute.
 func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
-                                %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+                                         %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
-   ^bb0(%in: i16, %in_0: i16, %out: i32):
-     %1 = arith.extui %in : i16 to i32
-     %2 = arith.extsi %in_0 : i16 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<16x32xi32>
-   return %0 : tensor<16x32xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xi32>) {
+  ^bb0(%in: i16, %in_0: i16, %out: i32):
+    %1 = arith.extui %in : i16 to i32
+    %2 = arith.extsi %in_0 : i16 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
 }
 
 // ALL-LABEL: negative_op_matmul_mixed_cast
@@ -208,18 +224,20 @@ func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi
 
 // Output-side casts are not representable by the named matmul ops.
 func.func @negative_op_matmul_output_cast(%A: tensor<16x8xi32>, %B: tensor<8x32xi32>,
-                                 %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
+                                          %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>) outs(%Out : tensor<16x32xi64>) {
-   ^bb0(%in: i32, %in_0: i32, %out: i64):
-     %3 = arith.trunci %out : i64 to i32
-     %4 = arith.muli %in, %in_0 : i32
-     %5 = arith.addi %3, %4 : i32
-     %6 = arith.extsi %5 : i32 to i64
-     linalg.yield %6 : i64
-   } -> tensor<16x32xi64>
-   return %0 : tensor<16x32xi64>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xi64>) {
+  ^bb0(%in: i32, %in_0: i32, %out: i64):
+    %3 = arith.trunci %out : i64 to i32
+    %4 = arith.muli %in, %in_0 : i32
+    %5 = arith.addi %3, %4 : i32
+    %6 = arith.extsi %5 : i32 to i64
+    linalg.yield %6 : i64
+  } -> tensor<16x32xi64>
+  return %0 : tensor<16x32xi64>
 }
 
 // ALL-LABEL: negative_op_matmul_output_cast
@@ -239,8 +257,10 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
                                           %B: tensor<8x32xi32>,
                                           %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i32, %in_0: i32, %out: f32):
     %1 = arith.bitcast %in : i32 to f32
     %2 = arith.bitcast %in_0 : i32 to f32
@@ -263,8 +283,10 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
 func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                        %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i16, %in_0: i16, %out: f32):
     %1 = arith.sitofp %in : i16 to f32
     %2 = arith.sitofp %in_0 : i16 to f32
@@ -290,8 +312,10 @@ func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16
 func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                          %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i16, %in_0: i16, %out: f32):
     %1 = arith.uitofp %in : i16 to f32
     %2 = arith.uitofp %in_0 : i16 to f32
@@ -319,10 +343,13 @@ func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>, %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>,
+                           %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -352,18 +379,18 @@ func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
                                          %B: tensor<2x8x16xi64>,
                                          %Out: tensor<2x16x16xi32>) -> tensor<2x16x16xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2],
-          iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<2x16x8xi16>, tensor<2x8x16xi64>)
-         outs(%Out : tensor<2x16x16xi32>) {
-   ^bb0(%in: i16, %in_0: i64, %out: i32):
-     %1 = arith.extui %in : i16 to i32
-     %2 = arith.trunci %in_0 : i64 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<2x16x16xi32>
-   return %0 : tensor<2x16x16xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xi16>, tensor<2x8x16xi64>)
+    outs(%Out : tensor<2x16x16xi32>) {
+  ^bb0(%in: i16, %in_0: i64, %out: i32):
+    %1 = arith.extui %in : i16 to i32
+    %2 = arith.trunci %in_0 : i64 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<2x16x16xi32>
+  return %0 : tensor<2x16x16xi32>
 }
 
 // ALL-LABEL: op_batch_matmul_unsigned_cast
@@ -385,10 +412,10 @@ func.func @op_multi_reduction(%A: tensor<10x20x30xf32>,
                               %B: tensor<30x20x40xf32>,
                               %C: tensor<10x40xf32>) -> tensor<10x40xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#mapA, #mapB, #mapC],
-            iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
-  ins(%A, %B : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
-  outs(%C : tensor<10x40xf32>) {
+    {indexing_maps = [#mapA, #mapB, #mapC],
+    iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
+    ins(%A, %B : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
+    outs(%C : tensor<10x40xf32>) {
   ^bb0(%a: f32, %b: f32, %c: f32):
     %1 = arith.mulf %a, %b : f32
     %2 = arith.addf %c, %1 : f32
@@ -415,8 +442,10 @@ func.func @op_multi_reduction(%A: tensor<10x20x30xf32>,
 func.func @batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8x16xf32>,
                                            %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#mapBni0, #mapBni1, #mapBni2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<4x2x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#mapBni0, #mapBni1, #mapBni2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<4x2x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -441,12 +470,12 @@ func.func @batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8
 #map2 = affine_map<(d0, d1) -> (d0)>
 func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>) -> tensor<?xf32> {
   %0 = linalg.generic
-          {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "reduction"]}
-          ins(%A, %B : tensor<?x?xf32>, tensor<?xf32>) outs(%Out : tensor<?xf32>) {
-      ^bb0(%in: f32, %in_0: f32, %out: f32):
-        %1 = arith.mulf %in, %in_0 : f32
-        %2 = arith.addf %out, %1 : f32
-        linalg.yield %2 : f32
+    {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?xf32>) outs(%Out : tensor<?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
   } -> tensor<?xf32>
   return %0 : tensor<?xf32>
 }
@@ -464,16 +493,18 @@ func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>
 #map_ta = affine_map<(d0, d1, d2) -> (d2, d0)>
 #map_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_ta, #map_b, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_ta, #map_b, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
@@ -499,16 +530,19 @@ func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 #map_a = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map_tb = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_a, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_a, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -536,10 +570,13 @@ func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 #map_ta = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
 #map_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16xf32>, %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16xf32>,
+                                       %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_ta, #map_b, #map_c], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x8x4xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#map_ta, #map_b, #map_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -573,10 +610,13 @@ func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16x
 #map_a = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map_tb = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8xf32>, %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8xf32>,
+                                       %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_a, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x4x8xf32>, tensor<2x16x8xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#map_a, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -611,16 +651,18 @@ func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8x
 #map_tb = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
 func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                       %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_ta, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_ta, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
@@ -649,16 +691,18 @@ func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 #map_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_tc = affine_map<(d0, d1, d2) -> (d1, d0)>
 func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                       %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_a, #map_b, #map_tc], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_a, #map_b, #map_tc],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -687,17 +731,18 @@ func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 #map_nc_b = affine_map<(m, k, n) -> (k, n)>
 #map_nc_c = affine_map<(m, k, n) -> (m, n)>
 func.func @op_matmul_non_canonical_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                            %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                         %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_nc_a, #map_nc_b, #map_nc_c],
-          iterator_types = ["parallel", "reduction", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_nc_a, #map_nc_b, #map_nc_c],
+    iterator_types = ["parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -724,11 +769,12 @@ func.func @op_matmul_non_canonical_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32
 #map_bnc_b = affine_map<(batch, m, k, n) -> (batch, k, n)>
 #map_bnc_c = affine_map<(batch, m, k, n) -> (batch, m, n)>
 func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>,
-                                                  %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                               %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bnc_a, #map_bnc_b, #map_bnc_c],
-            iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bnc_a, #map_bnc_b, #map_bnc_c],
+    iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -762,17 +808,17 @@ func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tenso
 #map_nc_tb_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_nc_tb_c = affine_map<(d0, d1, d2) -> (d0, d2)>
 func.func @op_matmul_non_canonical_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                                %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                               %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_nc_tb_a, #map_nc_tb_b, #map_nc_tb_c],
-          iterator_types = ["parallel", "reduction", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_nc_tb_a, #map_nc_tb_b, #map_nc_tb_c],
+    iterator_types = ["parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // NAMED-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -806,11 +852,12 @@ func.func @op_matmul_non_canonical_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?
 #map_bnc_tb_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_bnc_tb_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B: tensor<2x16x8xf32>,
-                                                      %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                                     %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bnc_tb_a, #map_bnc_tb_b, #map_bnc_tb_c],
-            iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bnc_tb_a, #map_bnc_tb_b, #map_bnc_tb_c],
+    iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -849,17 +896,18 @@ func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B:
 #map_fs_b = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map_fs_c = affine_map<(d0, d1, d2) -> (d1, d2)>
 func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                           %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                          %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_fs_a, #map_fs_b, #map_fs_c],
-          iterator_types = ["reduction", "parallel", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_fs_a, #map_fs_b, #map_fs_c],
+    iterator_types = ["reduction", "parallel", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
@@ -887,17 +935,18 @@ func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf3
 #map_bcast_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_bcast_c = affine_map<(d0, d1, d2) -> (d0, d1)>
 func.func @op_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_bcast_a, #map_bcast_b, #map_bcast_c],
-          iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_bcast_a, #map_bcast_b, #map_bcast_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
 // ALL-LABEL: op_matmul_broadcast_a
@@ -915,11 +964,12 @@ func.func @op_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
 #map_bbcast_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_bbcast_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 func.func @op_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf32>,
-                                              %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                       %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bbcast_a, #map_bbcast_b, #map_bbcast_c],
-            iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bbcast_a, #map_bbcast_b, #map_bbcast_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -943,11 +993,12 @@ func.func @op_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf
 #map_bbcast2_b = affine_map<(d0, d1, d2, d3) -> (d3)>
 #map_bbcast2_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 func.func @op_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<8xf32>,
-                                              %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                       %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bbcast2_a, #map_bbcast2_b, #map_bbcast2_c],
-            iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<8xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bbcast2_a, #map_bbcast2_b, #map_bbcast2_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<8xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32

>From 46a09477ca6a972b772f3a1915e6846d828ddec6 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 14:20:12 +0100
Subject: [PATCH 11/13] SpecializationOptions -> GenericOpSpecializationOptions

---
 .../mlir/Dialect/Linalg/Transforms/Transforms.h   | 15 ++++++++-------
 mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp   |  2 +-
 mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp |  9 +++++----
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 2fc083bf7b871..dcb7f1f212207 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -923,7 +923,7 @@ FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
                                        LinalgOp linalgOp);
 
-struct SpecializationOptions {
+struct GenericOpSpecializationOptions {
   // Specialize generics to category ops (default: named ops).
   bool emitCategoryOps = false;
 };
@@ -931,7 +931,7 @@ struct SpecializationOptions {
 /// Replace the given GenericOp with a namedOp or categoryOp.
 FailureOr<LinalgOp>
 specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
-                    const SpecializationOptions &options = {});
+                    const GenericOpSpecializationOptions &options = {});
 
 /// Create a new buffer using the `allocationFn` provided. The size of this
 /// buffer is either the original subview size when 'useOriginalSubviewSize' is
@@ -1724,9 +1724,9 @@ struct LinalgGeneralizationPattern
 
 struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
 
-  LinalgSpecializationPattern(MLIRContext *context,
-                              const SpecializationOptions &options = {},
-                              PatternBenefit benefit = 1)
+  LinalgSpecializationPattern(
+      MLIRContext *context, const GenericOpSpecializationOptions &options = {},
+      PatternBenefit benefit = 1)
       : OpRewritePattern<GenericOp>(context, benefit), options(options) {}
 
   FailureOr<GenericOp>
@@ -1740,7 +1740,7 @@ struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
   }
 
 private:
-  SpecializationOptions options;
+  GenericOpSpecializationOptions options;
 };
 
 /// Vectorization pattern for memref::CopyOp.
@@ -1956,7 +1956,8 @@ void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns);
 ///     p(x) = an*x^n + ... + a1x + a0
 /// There is no equivalent named op to convert to. Many such cases exist.
 void populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns, const SpecializationOptions &options = {});
+    RewritePatternSet &patterns,
+    const GenericOpSpecializationOptions &options = {});
 
 /// Populates `patterns` that convert linalg named ops e.g. `linalg.add`
 /// to equivalent `linalg.elementwise`.
diff --git a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
index 17416b42c47ab..fee293647deda 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
@@ -51,7 +51,7 @@ void LinalgMorphOpsPass::runOnOperation() {
 
   // Lifting paths (named <- category <- generic)
   if (genericToNamed || genericToCategory) {
-    SpecializationOptions opts;
+    GenericOpSpecializationOptions opts;
     opts.emitCategoryOps = genericToCategory;
     populateLinalgGenericOpsSpecializationPatterns(patterns, opts);
   }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index e5c0b979369aa..7cd7692ff3a6e 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -450,9 +450,9 @@ static FailureOr<LinalgOp> specializeLinalgConvolutions(RewriterBase &rewriter,
 //===----------------------------------------------------------------------===//
 // Categorize linalg generic to named op where possible.
 //===----------------------------------------------------------------------===//
-FailureOr<LinalgOp>
-mlir::linalg::specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
-                                  const SpecializationOptions &options) {
+FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(
+    RewriterBase &rewriter, GenericOp genericOp,
+    const GenericOpSpecializationOptions &options) {
   // Contraction - e.g. matmul
   if (isaContractionOpInterface(genericOp)) {
     return specializeLinalgContractions(rewriter, genericOp,
@@ -560,6 +560,7 @@ void LinalgSpecializeGenericOpsPass::runOnOperation() {
 }
 
 void mlir::linalg::populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns, const SpecializationOptions &options) {
+    RewritePatternSet &patterns,
+    const GenericOpSpecializationOptions &options) {
   patterns.add<LinalgSpecializationPattern>(patterns.getContext(), options);
 }

>From 3d2c8e522f6746fcbf84b01f9b2d25450f7e3275 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 14:27:42 +0100
Subject: [PATCH 12/13] Refactor linalg builder helpers

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 32 ++++++++++++------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 67d7406987569..37b549a7fcd7f 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -193,17 +193,17 @@ static void buildMatmulOp(OpBuilder &b, OperationState &state,
                           ValueRange inputs, ValueRange outputs,
                           ArrayRef<NamedAttribute> attributes,
                           RegionBuilderFn regionBuilder,
-                          ArrayRef<AffineMap> indexingMaps) {
-  // Initialize indexingMaps attribute, for MatmulOp.
-  SmallVector<Attribute, 3> indexingMapsAttrVal;
-  indexingMapsAttrVal =
-      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
-        return AffineMapAttr::get(map);
-      });
+                          ArrayRef<AffineMap> defaultIndexingMaps) {
+  // If indexing maps are not provided, apply the default ones.
   if (none_of(attributes, [](NamedAttribute attr) {
         return attr.getName() == "indexing_maps";
-      }))
+      })) {
+    SmallVector<Attribute, 3> indexingMapsAttrVal;
+    indexingMapsAttrVal = llvm::map_to_vector(
+        defaultIndexingMaps,
+        [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
     state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  }
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }
@@ -213,17 +213,17 @@ static void buildBatchMatmulOp(OpBuilder &b, OperationState &state,
                                ValueRange inputs, ValueRange outputs,
                                ArrayRef<NamedAttribute> attributes,
                                RegionBuilderFn regionBuilder,
-                               ArrayRef<AffineMap> indexingMaps) {
-  // Initialize indexingMaps attribute, for BatchMatmulOp.
-  SmallVector<Attribute, 4> indexingMapsAttrVal;
-  indexingMapsAttrVal =
-      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
-        return AffineMapAttr::get(map);
-      });
+                               ArrayRef<AffineMap> defaultIndexingMaps) {
+  // If indexing maps are not provided, apply the default ones.
   if (none_of(attributes, [](NamedAttribute attr) {
         return attr.getName() == "indexing_maps";
-      }))
+      })) {
+    SmallVector<Attribute, 4> indexingMapsAttrVal;
+    indexingMapsAttrVal = llvm::map_to_vector(
+        defaultIndexingMaps,
+        [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
     state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  }
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }

>From 674bc0883dbebee373d2e2eaf8e9d8da31798eee Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk at intel.com>
Date: Fri, 6 Mar 2026 14:32:20 +0100
Subject: [PATCH 13/13] Improve specialize docs

---
 mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index 7cd7692ff3a6e..ee9fc77961bab 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -230,6 +230,8 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
     return failure();
 
   // Only mul+add contraction is supported.
+  // Currently, there is no way to control the contraction body type in named
+  // and category ops, which all default to mul+add only.
   if (!mlir::linalg::detail::isContractionBody(
           *genericOp.getBlock(), [](Operation *first, Operation *second) {
             return (isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
@@ -460,9 +462,11 @@ FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(
   }
 
   // Early exit in case of category specialization.
-  // TODO: Remove when all variants account for both named and category.
+  // TODO: Remove when matches for other ops account for both named and
+  // category.
   if (options.emitCategoryOps)
-    return failure();
+    return rewriter.notifyMatchFailure(
+        genericOp, "no matching category op specialization");
 
   // Copy
   if (isaCopyOpInterface(genericOp)) {
@@ -533,10 +537,11 @@ FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(
   }
 
   // Convolution - e.g. *conv/pooling*
-  if (isaConvolutionOpInterface(genericOp)) {
+  if (isaConvolutionOpInterface(genericOp))
     return specializeLinalgConvolutions(rewriter, genericOp);
-  }
-  return failure();
+
+  return rewriter.notifyMatchFailure(genericOp,
+                                     "no matching named op specialization");
 }
 
 namespace {



More information about the Mlir-commits mailing list