[Mlir-commits] [mlir] 9d25d5c - [mlir][linalg] Generic to category specialization (#184624)

Fri Mar 6 06:35:05 PST 2026

Author: Adam Siemieniuk
Date: 2026-03-06T15:35:00+01:00
New Revision: 9d25d5c4a7f7840ef7ea75633757469148f12ebf

URL: https://github.com/llvm/llvm-project/commit/9d25d5c4a7f7840ef7ea75633757469148f12ebf
DIFF: https://github.com/llvm/llvm-project/commit/9d25d5c4a7f7840ef7ea75633757469148f12ebf.diff

LOG: [mlir][linalg] Generic to category specialization (#184624)

Adds initial support for the generic-to-category linalg morphism.
Only conversion to the contraction op is supported for now.

Added: 
    mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
    mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir

Modified: 
    mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
    mlir/include/mlir/Dialect/Linalg/Passes.td
    mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
    mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
    mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
    mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
    mlir/test/Dialect/Linalg/specialize-generic-ops.mlir

Removed: 
    mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index 4948bfffad5e0..5998f736ced34 100644

--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -889,6 +889,13 @@ def ContractOp : LinalgStructuredBase_Op<"contract", [
 
   let skipDefaultBuilders = 1;
   let builders = [
+    OpBuilder<
+      (ins "ValueRange":$inputs, "ValueRange":$outputs,
+            CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),
+      [{
+        buildStructuredOp($_builder, $_state, std::nullopt, inputs, outputs,
+                          attributes, regionBuilder);
+      }]>,
     OpBuilder<(ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs,
       "ValueRange":$outputs, "ArrayAttr":$indexingMaps,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes),

diff  --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index f48ea9849e237..26638b2a644c4 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -70,8 +70,10 @@ def LinalgMorphOpsPass : Pass<"linalg-morph-ops"> {
     
     // Specialization path is not guaranteed.
     Option<"genericToNamed", "generic-to-named", "bool", /*default=*/"false",
-           "convert linalg.generic to equivalent named ops"> ];
-    //  TODOs: `generic-to-category`, `category-to-named`
+           "convert linalg.generic to equivalent named ops">,
+    Option<"genericToCategory", "generic-to-category", "bool", /*default=*/"false",
+           "convert linalg.generic to equivalent category ops"> ];
+    //  TODOs: `category-to-named`
 }
 
 def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops">,

diff  --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index fb9cede670801..dcb7f1f212207 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -923,10 +923,15 @@ FailureOr<GenericOp> interchangeGenericOp(RewriterBase &rewriter,
 FailureOr<GenericOp> generalizeNamedOp(RewriterBase &rewriter,
                                        LinalgOp linalgOp);
 
-/// Create a namedOp from the given GenericOp and replace the GenericOp.
-/// Currently we can specialize only trivial linalg copy operations.
-FailureOr<LinalgOp> specializeGenericOp(RewriterBase &rewriter,
-                                        GenericOp genericOp);
+struct GenericOpSpecializationOptions {
+  // Specialize generics to category ops (default: named ops).
+  bool emitCategoryOps = false;
+};
+
+/// Replace the given GenericOp with a namedOp or categoryOp.
+FailureOr<LinalgOp>
+specializeGenericOp(RewriterBase &rewriter, GenericOp genericOp,
+                    const GenericOpSpecializationOptions &options = {});
 
 /// Create a new buffer using the `allocationFn` provided. The size of this
 /// buffer is either the original subview size when 'useOriginalSubviewSize' is
@@ -1718,17 +1723,24 @@ struct LinalgGeneralizationPattern
 };
 
 struct LinalgSpecializationPattern : public OpRewritePattern<GenericOp> {
-  using OpRewritePattern<GenericOp>::OpRewritePattern;
+
+  LinalgSpecializationPattern(
+      MLIRContext *context, const GenericOpSpecializationOptions &options = {},
+      PatternBenefit benefit = 1)
+      : OpRewritePattern<GenericOp>(context, benefit), options(options) {}
 
   FailureOr<GenericOp>
   returningMatchAndRewrite(GenericOp op, PatternRewriter &rewriter) const {
-    return specializeGenericOp(rewriter, op);
+    return specializeGenericOp(rewriter, op, options);
   }
 
   LogicalResult matchAndRewrite(GenericOp op,
                                 PatternRewriter &rewriter) const override {
     return returningMatchAndRewrite(op, rewriter);
   }
+
+private:
+  GenericOpSpecializationOptions options;
 };
 
 /// Vectorization pattern for memref::CopyOp.
@@ -1938,13 +1950,14 @@ void populateLinalgTilingCanonicalizationPatterns(RewritePatternSet &patterns);
 void populateLinalgNamedOpsGeneralizationPatterns(RewritePatternSet &patterns);
 
 /// Populates `patterns` with patterns to convert linalg.generic ops to named
-/// ops where possible. A linalg.generic can represent wide range and complex
-/// computations for which equivalent linalg named op may not exist e.g.
-/// linalg.generic that takes a tensor and computes a polynomial such as:
+/// or category ops where possible. A linalg.generic can represent a wide range
+/// of complex computations for which equivalent linalg named op may not exist
+/// e.g. linalg.generic that takes a tensor and computes a polynomial such as:
 ///     p(x) = an*x^n + ... + a1x + a0
 /// There is no equivalent named op to convert to. Many such cases exist.
 void populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns);
+    RewritePatternSet &patterns,
+    const GenericOpSpecializationOptions &options = {});
 
 /// Populates `patterns` that convert linalg named ops e.g. `linalg.add`
 /// to equivalent `linalg.elementwise`.

diff  --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index bfc03cc7436df..37b549a7fcd7f 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -193,14 +193,17 @@ static void buildMatmulOp(OpBuilder &b, OperationState &state,
                           ValueRange inputs, ValueRange outputs,
                           ArrayRef<NamedAttribute> attributes,
                           RegionBuilderFn regionBuilder,
-                          ArrayRef<AffineMap> indexingMaps) {
-  // Initialize indexingMaps attribute, for MatmulOp.
-  SmallVector<Attribute, 3> indexingMapsAttrVal;
-  indexingMapsAttrVal =
-      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
-        return AffineMapAttr::get(map);
-      });
-  state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+                          ArrayRef<AffineMap> defaultIndexingMaps) {
+  // If indexing maps are not provided, apply the default ones.
+  if (none_of(attributes, [](NamedAttribute attr) {
+        return attr.getName() == "indexing_maps";
+      })) {
+    SmallVector<Attribute, 3> indexingMapsAttrVal;
+    indexingMapsAttrVal = llvm::map_to_vector(
+        defaultIndexingMaps,
+        [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  }
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }
@@ -210,14 +213,17 @@ static void buildBatchMatmulOp(OpBuilder &b, OperationState &state,
                                ValueRange inputs, ValueRange outputs,
                                ArrayRef<NamedAttribute> attributes,
                                RegionBuilderFn regionBuilder,
-                               ArrayRef<AffineMap> indexingMaps) {
-  // Initialize indexingMaps attribute, for BatchMatmulOp.
-  SmallVector<Attribute, 4> indexingMapsAttrVal;
-  indexingMapsAttrVal =
-      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
-        return AffineMapAttr::get(map);
-      });
-  state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+                               ArrayRef<AffineMap> defaultIndexingMaps) {
+  // If indexing maps are not provided, apply the default ones.
+  if (none_of(attributes, [](NamedAttribute attr) {
+        return attr.getName() == "indexing_maps";
+      })) {
+    SmallVector<Attribute, 4> indexingMapsAttrVal;
+    indexingMapsAttrVal = llvm::map_to_vector(
+        defaultIndexingMaps,
+        [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); });
+    state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal));
+  }
   return buildStructuredOp(b, state, resultTensorTypes, inputs, outputs,
                            attributes, regionBuilder);
 }

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
index f261ccb1415fe..fee293647deda 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MorphOps.cpp
@@ -44,16 +44,16 @@ void LinalgMorphOpsPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
 
   // Lowering paths (named -> category -> generic)
-  if (namedToCategory) {
+  if (namedToCategory)
     populateLinalgNamedToElementwisePatterns(patterns);
-  }
-  if (namedToGeneric || categoryToGeneric) {
+  if (namedToGeneric || categoryToGeneric)
     populateLinalgNamedOpsGeneralizationPatterns(patterns);
-  }
 
   // Lifting paths (named <- category <- generic)
-  if (genericToNamed) {
-    populateLinalgGenericOpsSpecializationPatterns(patterns);
+  if (genericToNamed || genericToCategory) {
+    GenericOpSpecializationOptions opts;
+    opts.emitCategoryOps = genericToCategory;
+    populateLinalgGenericOpsSpecializationPatterns(patterns, opts);
   }
 
   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index b4de2bb1e1169..ee9fc77961bab 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -141,18 +141,28 @@ template <typename NamedOpTy>
 static LinalgOp replaceWithMatmulVariant(RewriterBase &rewriter, GenericOp op,
                                          std::optional<TypeFn> castTy,
                                          ArrayRef<AffineMap> indexingMaps) {
-  SmallVector<NamedAttribute> castAttrVec;
+  SmallVector<NamedAttribute> attributes;
   // Only explicitly specify the cast attribute for unsigned cast; signed is
   // the default for linalg.matmul/linalg.batch_matmul.
-  if (castTy.has_value() && *castTy == TypeFn::cast_unsigned)
-    castAttrVec = {rewriter.getNamedAttr(
-        "cast", TypeFnAttr::get(rewriter.getContext(), *castTy))};
+  if (castTy.has_value() && *castTy == TypeFn::cast_unsigned) {
+    auto castAttr = rewriter.getNamedAttr(
+        "cast", TypeFnAttr::get(rewriter.getContext(), *castTy));
+    attributes.push_back(castAttr);
+  }
 
-  auto namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(
+  // Set the original generic's maps to preserve operand indexing semantics like
+  // transposition.
+  SmallVector<Attribute, 3> indexingMapsAttrVal =
+      llvm::map_to_vector(indexingMaps, [](AffineMap map) -> Attribute {
+        return AffineMapAttr::get(map);
+      });
+  auto indexingMapsAttr = rewriter.getNamedAttr(
+      "indexing_maps", rewriter.getArrayAttr(indexingMapsAttrVal));
+  attributes.push_back(indexingMapsAttr);
+
+  LinalgOp namedOp = rewriter.replaceOpWithNewOp<NamedOpTy>(
       op, ValueRange{op.getDpsInputs()[0], op.getDpsInputs()[1]},
-      ValueRange{op.getDpsInits()[0]}, castAttrVec);
-
-  namedOp.setIndexingMapsAttr(rewriter.getAffineMapArrayAttr(indexingMaps));
+      ValueRange{op.getDpsInits()[0]}, attributes);
 
   return namedOp;
 }
@@ -208,7 +218,8 @@ static std::optional<TypeFn> getCastTypeForMatmulLikeOp(GenericOp genericOp) {
 
 // Converts linalg.generic to named linalg.*matmul* where possible.
 static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
-                                                        GenericOp genericOp) {
+                                                        GenericOp genericOp,
+                                                        bool emitCategoryOp) {
   if (genericOp.getNumDpsInputs() != 2 || genericOp.getNumDpsInits() != 1)
     return failure();
 
@@ -218,6 +229,31 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
                    [](AffineMap m) { return !m.isProjectedPermutation(); }))
     return failure();
 
+  // Only mul+add contraction is supported.
+  // Currently, there is no way to control the contraction body type in named
+  // and category ops which all default to mul+add only.
+  if (!mlir::linalg::detail::isContractionBody(
+          *genericOp.getBlock(), [](Operation *first, Operation *second) {
+            return (isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
+                   (isa<arith::MulIOp>(first) && isa<arith::AddIOp>(second)) ||
+                   (isa<complex::MulOp>(first) && isa<complex::AddOp>(second));
+          }))
+    return failure();
+
+  // Determine the cast type for the named matmul op, or bail out if casts
+  // cannot be represented by the named op.
+  std::optional<TypeFn> castTy = getCastTypeForMatmulLikeOp(genericOp);
+  if (!castTy)
+    return rewriter.notifyMatchFailure(
+        genericOp, "contains invalid cast ops for the named matmul op");
+
+  // For category ops, a wider range of variants is supported.
+  if (emitCategoryOp)
+    return replaceWithMatmulVariant<ContractOp>(
+        rewriter, genericOp, castTy, genericOp.getIndexingMapsArray());
+
+  // Further checks for named variants.
+  //
   // Linalg generic contraction can be across multiple axis e.g.
   // ```
   //      linalg.generic
@@ -244,14 +280,6 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
   if (dims.m.size() != 1 || dims.n.size() != 1 || dims.k.size() != 1)
     return failure();
 
-  if (!mlir::linalg::detail::isContractionBody(
-          *genericOp.getBlock(), [](Operation *first, Operation *second) {
-            return (isa<arith::MulFOp>(first) && isa<arith::AddFOp>(second)) ||
-                   (isa<arith::MulIOp>(first) && isa<arith::AddIOp>(second)) ||
-                   (isa<complex::MulOp>(first) && isa<complex::AddOp>(second));
-          }))
-    return failure();
-
   // Check rank of operands
   auto indexingMaps = genericOp.getIndexingMapsArray();
   if (llvm::any_of(indexingMaps, [&dims](AffineMap m) {
@@ -290,13 +318,6 @@ static FailureOr<LinalgOp> specializeLinalgContractions(RewriterBase &rewriter,
   if (llvm::is_contained({a, b, c}, IndexMatchResult::Mismatch))
     return failure();
 
-  // Determine the cast type for the named matmul op, or bail out if casts
-  // cannot be represented by the named op.
-  std::optional<TypeFn> castTy = getCastTypeForMatmulLikeOp(genericOp);
-  if (!castTy)
-    return rewriter.notifyMatchFailure(
-        genericOp, "contains invalid cast ops for the named matmul op");
-
   // Build indexing maps for the named op in its canonical dimension ordering
   auto *ctx = genericOp.getContext();
   unsigned numLoopDims = numOfBatchDims + 3;
@@ -431,8 +452,22 @@ static FailureOr<LinalgOp> specializeLinalgConvolutions(RewriterBase &rewriter,
 //===----------------------------------------------------------------------===//
 // Categorize linalg generic to named op where possible.
 //===----------------------------------------------------------------------===//
-FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(RewriterBase &rewriter,
-                                                      GenericOp genericOp) {
+FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(
+    RewriterBase &rewriter, GenericOp genericOp,
+    const GenericOpSpecializationOptions &options) {
+  // Contraction - e.g. matmul
+  if (isaContractionOpInterface(genericOp)) {
+    return specializeLinalgContractions(rewriter, genericOp,
+                                        options.emitCategoryOps);
+  }
+
+  // Early exit in case of category specialization.
+  // TODO: Remove when matches for other ops account for both named and
+  // category.
+  if (options.emitCategoryOps)
+    return rewriter.notifyMatchFailure(
+        genericOp, "no matching category op specialization");
+
   // Copy
   if (isaCopyOpInterface(genericOp)) {
     LinalgOp namedOp = rewriter.replaceOpWithNewOp<CopyOp>(
@@ -501,16 +536,12 @@ FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(RewriterBase &rewriter,
     }
   }
 
-  // Contraction - e.g. matmul
-  if (isaContractionOpInterface(genericOp)) {
-    return specializeLinalgContractions(rewriter, genericOp);
-  }
-
   // Convolution - e.g. *conv/pooling*
-  if (isaConvolutionOpInterface(genericOp)) {
+  if (isaConvolutionOpInterface(genericOp))
     return specializeLinalgConvolutions(rewriter, genericOp);
-  }
-  return failure();
+
+  return rewriter.notifyMatchFailure(genericOp,
+                                     "no matching named op specialization");
 }
 
 namespace {
@@ -534,6 +565,7 @@ void LinalgSpecializeGenericOpsPass::runOnOperation() {
 }
 
 void mlir::linalg::populateLinalgGenericOpsSpecializationPatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<LinalgSpecializationPattern>(patterns.getContext());
+    RewritePatternSet &patterns,
+    const GenericOpSpecializationOptions &options) {
+  patterns.add<LinalgSpecializationPattern>(patterns.getContext(), options);
 }

diff  --git a/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir
deleted file mode 100644
index f15ae646e5765..0000000000000
--- a/mlir/test/Dialect/Linalg/roundtrip-linalg-named-ops.mlir
+++ /dev/null
@@ -1,72 +0,0 @@
-// The following test examples of linalg named ops lowered to linalg.generic and then
-// lifted back up to named op.
-// RUN: mlir-opt %s -linalg-generalize-named-ops | mlir-opt --linalg-specialize-generic-ops | FileCheck %s
-
-func.func @unary_exp(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
-  linalg.exp ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
-  return
-}
-
-// CHECK-LABEL: unary_exp
-// CHECK-SAME: %[[A:.+]]: memref<7x14x21xf32>, %[[Out:.+]]: memref<7x14x21xf32>)
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.exp ins(%[[A]] : memref<7x14x21xf32>) outs(%[[Out]] : memref<7x14x21xf32>)
-
-// -----
-
-func.func @binary_add(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = linalg.add ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: binary_add
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.add ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// -----
-
-
-///----------------------------------------------------------------------------------------
-/// Tests for linalg.matmul
-///----------------------------------------------------------------------------------------
-
-func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: @matmul
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-
-// -----
-
-// Check matmul with unsigned cast is correctly raised back to named op.
-func.func @matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
-                                %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
-  %0 = linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-                     ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
-                     outs(%Out : tensor<16x32xi32>) -> tensor<16x32xi32>
-  return %0 : tensor<16x32xi32>
-}
-
-// CHECK-LABEL: @matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-
-// -----
-
-func.func @mixed_named_ops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                   %C: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %AB = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  %1 = linalg.add ins(%AB, %C : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
-}
-
-// CHECK-LABEL: @mixed_named_ops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[C:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: %[[AB:.+]] = linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK: linalg.add ins(%[[AB]], %[[C]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>

diff  --git a/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
new file mode 100644
index 0000000000000..d5e49a866eaec
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-category-ops.mlir
@@ -0,0 +1,107 @@
+// The following tests are examples of linalg category ops lowered to
+// linalg.generic and then lifted back up to category ops.
+
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=category-to-generic \
+// RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-category \
+// RUN: | FileCheck %s
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @contract_matmul(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
+    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-LABEL: contract_matmul
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+
+func.func @contract_matmul_memref(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
+    %arg2: memref<?x?xf32>) {
+  linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : memref<?x?xf32>, memref<?x?xf32>)
+    outs(%arg2 : memref<?x?xf32>)
+  return
+}
+
+// CHECK-LABEL: contract_matmul_memref
+// CHECK-SAME: %[[A:.+]]: memref<?x?xf32>, %[[B:.+]]: memref<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: memref<?x?xf32>)
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: ins(%[[A]], %[[B]] : memref<?x?xf32>, memref<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<?x?xf32>)
+
+func.func @contract_matmul_bitcast_int_to_float(%arg0: tensor<16x8xi32>,
+    %arg1: tensor<8x32xi32>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%arg2 : tensor<16x32xf32>) -> tensor<16x32xf32>
+  return %0 : tensor<16x32xf32>
+}
+
+// CHECK-LABEL: contract_matmul_bitcast_int_to_float
+// CHECK-SAME: %[[A:.+]]: tensor<16x8xi32>, %[[B:.+]]: tensor<8x32xi32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-NOT: cast =
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi32>, tensor<8x32xi32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+
+func.func @contract_matmul_unsigned_cast_float(%arg0: tensor<16x8xi16>,
+    %arg1: tensor<8x32xi16>, %arg2: tensor<16x32xf32>) -> tensor<16x32xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    {cast = #linalg.type_fn<cast_unsigned>}
+    ins(%arg0, %arg1 : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%arg2 : tensor<16x32xf32>) -> tensor<16x32xf32>
+  return %0 : tensor<16x32xf32>
+}
+
+// CHECK-LABEL: contract_matmul_unsigned_cast_float
+// CHECK-SAME: %[[A:.+]]: tensor<16x8xi16>, %[[B:.+]]: tensor<8x32xi16>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CHECK-SAME: cast = #linalg.type_fn<cast_unsigned>
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<16x8xi16>, tensor<8x32xi16>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+
+// -----
+
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+func.func @contract_multi_reduction(%arg0: tensor<10x20x30xf32>,
+    %arg1: tensor<30x20x40xf32>, %arg2: tensor<10x40xf32>) -> tensor<10x40xf32> {
+  %0 = linalg.contract indexing_maps = [#map, #map1, #map2]
+    ins(%arg0, %arg1 : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
+    outs(%arg2 : tensor<10x40xf32>) -> tensor<10x40xf32>
+  return %0 : tensor<10x40xf32>
+}
+
+// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2, d1)>
+// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+
+// CHECK-LABEL: contract_multi_reduction
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.contract
+// CHECK-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}

diff  --git a/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
new file mode 100644
index 0000000000000..19b30ef10da84
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
@@ -0,0 +1,99 @@
+// The following tests are examples of linalg named ops lowered to
+// linalg.generic and then lifted back up to named ops.
+
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=named-to-generic \
+// RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-named \
+// RUN: | FileCheck %s
+
+func.func @unary_exp(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
+  linalg.exp ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+  return
+}
+
+// CHECK-LABEL: unary_exp
+// CHECK-SAME: %[[A:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.exp
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// -----
+
+func.func @binary_add(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.add
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: binary_add
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.add
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// -----
+
+
+///----------------------------------------------------------------------------------------
+/// Tests for linalg.matmul
+///----------------------------------------------------------------------------------------
+
+func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: @matmul
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// -----
+
+// Check matmul with unsigned cast is correctly raised back to named op.
+func.func @matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
+    %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+  %0 = linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
+    outs(%Out : tensor<16x32xi32>) -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
+}
+
+// CHECK-LABEL: @matmul_unsigned_cast
+// CHECK-NOT: linalg.generic
+// CHECK: linalg.matmul
+// CHECK-SAME: {cast = #linalg.type_fn<cast_unsigned>}
+
+// -----
+
+func.func @mixed_named_ops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+    %C: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %AB = linalg.matmul
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %1 = linalg.add
+    ins(%AB, %C : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: @mixed_named_ops
+// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[C:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NOT: linalg.generic
+// CHECK: %[[AB:.+]] = linalg.matmul
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK: linalg.add
+// CHECK-SAME: ins(%[[AB]], %[[C]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>

diff  --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index 5c58a5fedd639..029d11a4f60de 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -1,10 +1,16 @@
-// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-named \
+// RUN: | FileCheck %s --check-prefixes=NAMED,ALL
+
+// RUN: mlir-opt %s -split-input-file -linalg-morph-ops=generic-to-category \
+// RUN: | FileCheck %s --check-prefixes=CATEGORY,ALL
 
 #umap = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = linalg.generic
-          {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
-          ins(%A : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+    {indexing_maps = [#umap, #umap],
+    iterator_types = ["parallel", "parallel","parallel"]}
+    ins(%A : tensor<?x?x?xf32>)
+    outs(%Out : tensor<?x?x?xf32>) {
   ^bb0(%in: f32, %out: f32):
     %1 = math.exp %in : f32
     linalg.yield %1 : f32
@@ -12,18 +18,27 @@ func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tenso
   return %0 : tensor<?x?x?xf32>
 }
 
-// CHECK-LABEL: unary_op_exp
-// CHECK-SAME: %[[A:.+]]: tensor<?x?x?xf32>, %[[Out:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.exp ins(%[[A]] : tensor<?x?x?xf32>) outs(%[[Out]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// ALL-LABEL: unary_op_exp
+// ALL-SAME: %[[A:.+]]: tensor<?x?x?xf32>, %[[OUT:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.exp
+// NAMED-SAME: ins(%[[A]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+
+// Not supported yet.
+// CATEGORY: linalg.generic
 
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                         %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+    {indexing_maps = [#map, #map, #map],
+    iterator_types = ["parallel", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.divf %in, %in_0 : f32
     linalg.yield %1 : f32
@@ -31,10 +46,17 @@ func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<
   return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: binary_op_div
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.div ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-LABEL: binary_op_div
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// ALL-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.div
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// Not supported yet.
+// CATEGORY: linalg.generic
 
 // -----
 
@@ -45,22 +67,39 @@ func.func @binary_op_div(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                     %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,  %[[Out:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>,
+// ALL-SAME: %[[OUT:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // Cast-auditing tests: ensure we only specialize when the cast semantics can
 // be expressed by linalg.matmul, and use the cast attribute when needed.
@@ -69,8 +108,10 @@ func.func @op_matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?x
 func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
                                    %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi32>) outs(%Out : tensor<16x32xi32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xi32>) {
   ^bb0(%in: i16, %in_0: i32, %out: i32):
     %1 = arith.extui %in : i16 to i32
     %3 = arith.muli %1, %in_0 : i32
@@ -80,9 +121,15 @@ func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
   return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
 
 // Ensures truncation rounding is tolerated with unsigned cases.
 // Note: We only consider casts as conflicting if they have different
@@ -94,8 +141,10 @@ func.func @op_matmul_unsigned_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi32>,
 func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor<8x32xi64>,
                                                 %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>) outs(%Out : tensor<16x32xi32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi64>)
+    outs(%Out : tensor<16x32xi32>) {
   ^bb0(%in: i16, %in_0: i64, %out: i32):
     %1 = arith.extui %in : i16 to i32
     %2 = arith.trunci %in_0 : i64 to i32
@@ -106,70 +155,98 @@ func.func @op_matmul_unsigned_cast_and_truncate(%A: tensor<16x8xi16>, %B: tensor
   return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast_and_truncate
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast_and_truncate
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
 
 // Signed casts are the default, no cast attribute is required.
 func.func @op_matmul_signed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                  %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
-   ^bb0(%in: i16, %in_0: i16, %out: i32):
-     %1 = arith.extsi %in : i16 to i32
-     %2 = arith.extsi %in_0 : i16 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<16x32xi32>
-   return %0 : tensor<16x32xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xi32>) {
+  ^bb0(%in: i16, %in_0: i16, %out: i32):
+    %1 = arith.extsi %in : i16 to i32
+    %2 = arith.extsi %in_0 : i16 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: op_matmul_signed_cast
-// CHECK-NOT: linalg.generic
-// CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_signed_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-NOT: {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: %[[RES:.+]] = linalg.contract
+// CATEGORY-SAME: indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-NOT: {cast =
+// CATEGORY-SAME: ins
+// CATEGORY: return %[[RES]]
 
 // Mixed signed/unsigned inputs cannot be encoded with a single cast attribute.
 func.func @negative_op_matmul_mixed_cast(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
-                                %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
+                                         %Out: tensor<16x32xi32>) -> tensor<16x32xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xi32>) {
-   ^bb0(%in: i16, %in_0: i16, %out: i32):
-     %1 = arith.extui %in : i16 to i32
-     %2 = arith.extsi %in_0 : i16 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<16x32xi32>
-   return %0 : tensor<16x32xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xi32>) {
+  ^bb0(%in: i16, %in_0: i16, %out: i32):
+    %1 = arith.extui %in : i16 to i32
+    %2 = arith.extsi %in_0 : i16 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<16x32xi32>
+  return %0 : tensor<16x32xi32>
 }
 
-// CHECK-LABEL: negative_op_matmul_mixed_cast
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: negative_op_matmul_mixed_cast
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
+
+// CATEGORY: linalg.generic
+// CATEGORY-NOT: linalg.contract
 
 // Output-side casts are not representable by the named matmul ops.
 func.func @negative_op_matmul_output_cast(%A: tensor<16x8xi32>, %B: tensor<8x32xi32>,
-                                 %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
+                                          %Out: tensor<16x32xi64>) -> tensor<16x32xi64> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>) outs(%Out : tensor<16x32xi64>) {
-   ^bb0(%in: i32, %in_0: i32, %out: i64):
-     %3 = arith.trunci %out : i64 to i32
-     %4 = arith.muli %in, %in_0 : i32
-     %5 = arith.addi %3, %4 : i32
-     %6 = arith.extsi %5 : i32 to i64
-     linalg.yield %6 : i64
-   } -> tensor<16x32xi64>
-   return %0 : tensor<16x32xi64>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xi64>) {
+  ^bb0(%in: i32, %in_0: i32, %out: i64):
+    %3 = arith.trunci %out : i64 to i32
+    %4 = arith.muli %in, %in_0 : i32
+    %5 = arith.addi %3, %4 : i32
+    %6 = arith.extsi %5 : i32 to i64
+    linalg.yield %6 : i64
+  } -> tensor<16x32xi64>
+  return %0 : tensor<16x32xi64>
 }
 
-// CHECK-LABEL: negative_op_matmul_output_cast
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: negative_op_matmul_output_cast
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
+
+// CATEGORY: linalg.generic
+// CATEGORY-NOT: linalg.contract
 
 // Bitcasts are not modeled by the cast attribute, but should not block
 // specialization.
@@ -180,8 +257,10 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
                                           %B: tensor<8x32xi32>,
                                           %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi32>, tensor<8x32xi32>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i32, %in_0: i32, %out: f32):
     %1 = arith.bitcast %in : i32 to f32
     %2 = arith.bitcast %in_0 : i32 to f32
@@ -192,16 +271,22 @@ func.func @op_matmul_bitcast_int_to_float(%A: tensor<16x8xi32>,
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_bitcast_int_to_float
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_bitcast_int_to_float
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // Signed float casts only use sitofp, which defaults to signed semantics.
 func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                        %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i16, %in_0: i16, %out: f32):
     %1 = arith.sitofp %in : i16 to f32
     %2 = arith.sitofp %in_0 : i16 to f32
@@ -212,17 +297,25 @@ func.func @op_matmul_signed_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_signed_cast_float
-// CHECK-NOT: linalg.generic
-// CHECK-NOT: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
-// CHECK: linalg.matmul
+// ALL-LABEL: op_matmul_signed_cast_float
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-NOT: {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: %[[RES:.+]] = linalg.contract
+// CATEGORY-NOT: {cast =
+// CATEGORY: return %[[RES]]
 
 // Unsigned float casts are expressed via uitofp and use the unsigned cast attr.
 func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi16>,
                                          %Out: tensor<16x32xf32>) -> tensor<16x32xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>) outs(%Out : tensor<16x32xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xi16>, tensor<8x32xi16>)
+    outs(%Out : tensor<16x32xf32>) {
   ^bb0(%in: i16, %in_0: i16, %out: f32):
     %1 = arith.uitofp %in : i16 to f32
     %2 = arith.uitofp %in_0 : i16 to f32
@@ -233,9 +326,13 @@ func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi
   return %0 : tensor<16x32xf32>
 }
 
-// CHECK-LABEL: op_matmul_unsigned_cast_float
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_matmul_unsigned_cast_float
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract{{.*}}{cast = #linalg.type_fn<cast_unsigned>}
 
 // -----
 
@@ -246,10 +343,13 @@ func.func @op_matmul_unsigned_cast_float(%A: tensor<16x8xi16>, %B: tensor<8x32xi
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>, %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>,
+                           %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -258,10 +358,20 @@ func.func @op_batch_matmul(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>, %Out:
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: op_batch_matmul
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[Out:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// ALL-LABEL: op_batch_matmul
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>,  %[[OUT:.+]]: tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // Ensure that the unsigned cast path for cast detection is exercised for
 // batch_matmul as well.
@@ -269,38 +379,43 @@ func.func @op_batch_matmul_unsigned_cast(%A: tensor<2x16x8xi16>,
                                          %B: tensor<2x8x16xi64>,
                                          %Out: tensor<2x16x16xi32>) -> tensor<2x16x16xi32> {
   %0 = linalg.generic
-         {indexing_maps = [#map, #map1, #map2],
-          iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<2x16x8xi16>, tensor<2x8x16xi64>)
-         outs(%Out : tensor<2x16x16xi32>) {
-   ^bb0(%in: i16, %in_0: i64, %out: i32):
-     %1 = arith.extui %in : i16 to i32
-     %2 = arith.trunci %in_0 : i64 to i32
-     %3 = arith.muli %1, %2 : i32
-     %4 = arith.addi %out, %3 : i32
-     linalg.yield %4 : i32
-   } -> tensor<2x16x16xi32>
-   return %0 : tensor<2x16x16xi32>
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xi16>, tensor<2x8x16xi64>)
+    outs(%Out : tensor<2x16x16xi32>) {
+  ^bb0(%in: i16, %in_0: i64, %out: i32):
+    %1 = arith.extui %in : i16 to i32
+    %2 = arith.trunci %in_0 : i64 to i32
+    %3 = arith.muli %1, %2 : i32
+    %4 = arith.addi %out, %3 : i32
+    linalg.yield %4 : i32
+  } -> tensor<2x16x16xi32>
+  return %0 : tensor<2x16x16xi32>
 }
 
-// CHECK-LABEL: op_batch_matmul_unsigned_cast
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul {cast = #linalg.type_fn<cast_unsigned>}
+// ALL-LABEL: op_batch_matmul_unsigned_cast
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul {cast = #linalg.type_fn<cast_unsigned>}
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract indexing_maps = {{\[}}#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]{{\]}}
+// CATEGORY-SAME: {cast = #linalg.type_fn<cast_unsigned>}
 
 // -----
 
-// This is a multi-reduction linalg.generic and cannot be lifted to matrix multiply
+// A multi-reduction contraction.
 #mapA = affine_map<(m, n, k1, k2) -> (m, k1, k2)>
 #mapB = affine_map<(m, n, k1, k2) -> (k2, k1, n)>
 #mapC = affine_map<(m, n, k1, k2) -> (m, n)>
-func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
-                                       %B: tensor<30x20x40xf32>,
-                                       %C: tensor<10x40xf32>) -> tensor<10x40xf32> {
+func.func @op_multi_reduction(%A: tensor<10x20x30xf32>,
+                              %B: tensor<30x20x40xf32>,
+                              %C: tensor<10x40xf32>) -> tensor<10x40xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#mapA, #mapB, #mapC],
-            iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
-  ins(%A, %B : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
-  outs(%C : tensor<10x40xf32>) {
+    {indexing_maps = [#mapA, #mapB, #mapC],
+    iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
+    ins(%A, %B : tensor<10x20x30xf32>, tensor<30x20x40xf32>)
+    outs(%C : tensor<10x40xf32>) {
   ^bb0(%a: f32, %b: f32, %c: f32):
     %1 = arith.mulf %a, %b : f32
     %2 = arith.addf %c, %1 : f32
@@ -309,8 +424,13 @@ func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
   return %0 : tensor<10x40xf32>
 }
 
-// CHECK-LABEL: negative_op_multi_reduction
-// CHECK: linalg.generic
+// ALL-LABEL: op_multi_reduction
+
+// Cannot be lifted to named matrix multiply.
+// NAMED: linalg.generic
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -319,11 +439,13 @@ func.func @negative_op_multi_reduction(%A: tensor<10x20x30xf32>,
 #mapBni0 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
 #mapBni1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #mapBni2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8x16xf32>,
-                                                     %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: tensor<2x8x16xf32>,
+                                           %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#mapBni0, #mapBni1, #mapBni2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<4x2x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#mapBni0, #mapBni1, #mapBni2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<4x2x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -332,28 +454,40 @@ func.func @negative_batch_matmul_non_identity_batch(%A: tensor<4x2x8xf32>, %B: t
   return %0 : tensor<2x4x16xf32>
 }
 
-// CHECK-LABEL: negative_batch_matmul_non_identity_batch
-// CHECK: linalg.generic
+// ALL-LABEL: batch_matmul_non_identity_batch
+
+// Cannot be lifted to named matrix multiply.
+// NAMED: linalg.generic
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
-// TODO: matvec
+// TODO: named matvec
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1)>
 #map2 = affine_map<(d0, d1) -> (d0)>
 func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>) -> tensor<?xf32> {
   %0 = linalg.generic
-          {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "reduction"]}
-          ins(%A, %B : tensor<?x?xf32>, tensor<?xf32>) outs(%Out : tensor<?xf32>) {
-      ^bb0(%in: f32, %in_0: f32, %out: f32):
-        %1 = arith.mulf %in, %in_0 : f32
-        %2 = arith.addf %out, %1 : f32
-        linalg.yield %2 : f32
+    {indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?xf32>)
+    outs(%Out : tensor<?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
   } -> tensor<?xf32>
   return %0 : tensor<?xf32>
 }
-// CHECK-LABEL: op_matvec
-// CHECK: linalg.generic
+
+// ALL-LABEL: op_matvec
+
+// NAMED: linalg.generic
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -361,28 +495,36 @@ func.func @op_matvec(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %Out: tensor<?xf32>
 #map_ta = affine_map<(d0, d1, d2) -> (d2, d0)>
 #map_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_ta, #map_b, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_ta, #map_b, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_a
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_a
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
 
 // -----
 
@@ -390,28 +532,39 @@ func.func @op_matmul_transpose_a(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 #map_a = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map_tb = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_a, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_a, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -419,10 +572,13 @@ func.func @op_matmul_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %Out:
 #map_ta = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
 #map_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16xf32>, %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16xf32>,
+                                       %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_ta, #map_b, #map_c], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x8x4xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#map_ta, #map_b, #map_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -431,16 +587,24 @@ func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16x
   return %0 : tensor<2x4x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_transpose_a
-// CHECK-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// ALL-LABEL: op_batch_matmul_transpose_a
+// ALL-SAME: %[[A:.+]]: tensor<2x8x4xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[OUT:.+]]: tensor<2x4x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x8x4xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
 // -----
 
@@ -448,10 +612,13 @@ func.func @op_batch_matmul_transpose_a(%A: tensor<2x8x4xf32>, %B: tensor<2x8x16x
 #map_a = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map_tb = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8xf32>, %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
+func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8xf32>,
+                                       %Out: tensor<2x4x16xf32>) -> tensor<2x4x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_a, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x4x8xf32>, tensor<2x16x8xf32>) outs(%Out : tensor<2x4x16xf32>) {
+    {indexing_maps = [#map_a, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+    outs(%Out : tensor<2x4x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -460,16 +627,24 @@ func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8x
   return %0 : tensor<2x4x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x4x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// ALL-LABEL: op_batch_matmul_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<2x4x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[OUT:.+]]: tensor<2x4x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x16x8xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x4x16xf32>) -> tensor<2x4x16xf32>
 
 // -----
 
@@ -478,28 +653,38 @@ func.func @op_batch_matmul_transpose_b(%A: tensor<2x4x8xf32>, %B: tensor<2x16x8x
 #map_tb = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map_c = affine_map<(d0, d1, d2) -> (d0, d1)>
 func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                       %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_ta, #map_tb, #map_c], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_ta, #map_tb, #map_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_transpose_a_and_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_TA:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
+// ALL-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// ALL-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// ALL-LABEL: op_matmul_transpose_a_and_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_TA]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -508,28 +693,38 @@ func.func @op_matmul_transpose_a_and_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 #map_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_tc = affine_map<(d0, d1, d2) -> (d1, d0)>
 func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                       %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_a, #map_b, #map_tc], iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_a, #map_b, #map_tc],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
-// CHECK-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
-// CHECK-LABEL: op_matmul_transposed_output
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// ALL-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// ALL-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// ALL-DAG: #[[$MAP_TC:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+
+// ALL-LABEL: op_matmul_transposed_output
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_TC]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -538,24 +733,37 @@ func.func @op_matmul_transposed_output(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
 #map_nc_b = affine_map<(m, k, n) -> (k, n)>
 #map_nc_c = affine_map<(m, k, n) -> (m, n)>
 func.func @op_matmul_non_canonical_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                            %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                         %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_nc_a, #map_nc_b, #map_nc_c],
-          iterator_types = ["parallel", "reduction", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_nc_a, #map_nc_b, #map_nc_c],
+    iterator_types = ["parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul_non_canonical_loops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+
+// ALL-LABEL: op_matmul_non_canonical_loops
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
 
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 // -----
 
 // Batch matmul with non-canonical loop ordering.
@@ -563,11 +771,12 @@ func.func @op_matmul_non_canonical_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32
 #map_bnc_b = affine_map<(batch, m, k, n) -> (batch, k, n)>
 #map_bnc_c = affine_map<(batch, m, k, n) -> (batch, m, n)>
 func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tensor<2x8x16xf32>,
-                                                  %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                               %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bnc_a, #map_bnc_b, #map_bnc_c],
-            iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bnc_a, #map_bnc_b, #map_bnc_c],
+    iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -576,10 +785,23 @@ func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tenso
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: op_batch_matmul_non_canonical_loops
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[Out:.+]]: tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>) outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+
+// ALL-LABEL: op_batch_matmul_non_canonical_loops
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x8x16xf32>, %[[OUT:.+]]: tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x8x16xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // -----
 
@@ -588,29 +810,41 @@ func.func @op_batch_matmul_non_canonical_loops(%A: tensor<2x16x8xf32>, %B: tenso
 #map_nc_tb_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_nc_tb_c = affine_map<(d0, d1, d2) -> (d0, d2)>
 func.func @op_matmul_non_canonical_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                                %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                               %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_nc_tb_a, #map_nc_tb_b, #map_nc_tb_c],
-          iterator_types = ["parallel", "reduction", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_nc_tb_a, #map_nc_tb_b, #map_nc_tb_c],
+    iterator_types = ["parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-// CHECK-LABEL: op_matmul_non_canonical_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// NAMED-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// NAMED-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+// NAMED-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+
+// ALL-LABEL: op_matmul_non_canonical_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -620,11 +854,12 @@ func.func @op_matmul_non_canonical_transpose_b(%A: tensor<?x?xf32>, %B: tensor<?
 #map_bnc_tb_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_bnc_tb_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B: tensor<2x16x8xf32>,
-                                                      %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+                                                     %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bnc_tb_a, #map_bnc_tb_b, #map_bnc_tb_c],
-            iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<2x16x8xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bnc_tb_a, #map_bnc_tb_b, #map_bnc_tb_c],
+    iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -633,16 +868,28 @@ func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B:
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-// CHECK-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
-// CHECK-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-// CHECK-LABEL: op_batch_matmul_non_canonical_transpose_b
-// CHECK-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[Out:.+]]: tensor<2x16x16xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.batch_matmul
-// CHECK-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
-// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
-// CHECK-SAME: outs(%[[Out]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+// NAMED-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+// NAMED-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+// NAMED-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+// CATEGORY-DAG: #[[$MAP_TB:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+
+// ALL-LABEL: op_batch_matmul_non_canonical_transpose_b
+// ALL-SAME: %[[A:.+]]: tensor<2x16x8xf32>, %[[B:.+]]: tensor<2x16x8xf32>, %[[OUT:.+]]: tensor<2x16x16xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.batch_matmul
+// NAMED-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_TB]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<2x16x8xf32>, tensor<2x16x8xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
 
 // -----
 
@@ -651,23 +898,37 @@ func.func @op_batch_matmul_non_canonical_transpose_b(%A: tensor<2x16x8xf32>, %B:
 #map_fs_b = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map_fs_c = affine_map<(d0, d1, d2) -> (d1, d2)>
 func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>,
-                                           %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+                                          %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_fs_a, #map_fs_b, #map_fs_c],
-          iterator_types = ["reduction", "parallel", "parallel"]}
-         ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_fs_a, #map_fs_b, #map_fs_c],
+    iterator_types = ["reduction", "parallel", "parallel"]}
+    ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: op_matmul_fully_shuffled_loops
-// CHECK-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[Out:.+]]: tensor<?x?xf32>
-// CHECK-NOT: linalg.generic
-// CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[Out]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CATEGORY-DAG: #[[$MAP_A:.+]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+// CATEGORY-DAG: #[[$MAP_B:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CATEGORY-DAG: #[[$MAP_C:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+
+// ALL-LABEL: op_matmul_fully_shuffled_loops
+// ALL-SAME: %[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[OUT:.+]]: tensor<?x?xf32>
+
+// NAMED-NOT: linalg.generic
+// NAMED: linalg.matmul
+// NAMED-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
+// CATEGORY-SAME: indexing_maps = [#[[$MAP_A]], #[[$MAP_B]], #[[$MAP_C]]]
+// CATEGORY-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CATEGORY-SAME: outs(%[[OUT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
 
 // -----
 
@@ -675,23 +936,28 @@ func.func @op_matmul_fully_shuffled_loops(%A: tensor<?x?xf32>, %B: tensor<?x?xf3
 #map_bcast_a = affine_map<(d0, d1, d2) -> (d2)>
 #map_bcast_b = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map_bcast_c = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @negative_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
-                                        %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @op_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
+                                 %Out: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic
-         {indexing_maps = [#map_bcast_a, #map_bcast_b, #map_bcast_c],
-          iterator_types = ["parallel", "parallel", "reduction"]}
-         ins(%A, %B : tensor<?xf32>, tensor<?x?xf32>) outs(%Out : tensor<?x?xf32>) {
-   ^bb0(%in: f32, %in_0: f32, %out: f32):
-     %1 = arith.mulf %in, %in_0 : f32
-     %2 = arith.addf %out, %1 : f32
-     linalg.yield %2 : f32
-   } -> tensor<?x?xf32>
-   return %0 : tensor<?x?xf32>
+    {indexing_maps = [#map_bcast_a, #map_bcast_b, #map_bcast_c],
+    iterator_types = ["parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<?xf32>, tensor<?x?xf32>)
+    outs(%Out : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %1 = arith.mulf %in, %in_0 : f32
+    %2 = arith.addf %out, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: negative_matmul_broadcast_a
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.matmul
+// ALL-LABEL: op_matmul_broadcast_a
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -699,12 +965,13 @@ func.func @negative_matmul_broadcast_a(%A: tensor<?xf32>, %B: tensor<?x?xf32>,
 #map_bbcast_a = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
 #map_bbcast_b = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map_bbcast_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf32>,
-                                              %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+func.func @op_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x8x16xf32>,
+                                       %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bbcast_a, #map_bbcast_b, #map_bbcast_c],
-            iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<16x8xf32>, tensor<2x8x16xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bbcast_a, #map_bbcast_b, #map_bbcast_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<16x8xf32>, tensor<2x8x16xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -713,9 +980,13 @@ func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: negative_batch_matmul_broadcast_a
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.batch_matmul
+// ALL-LABEL: op_batch_matmul_broadcast_a
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.batch_matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract
 
 // -----
 
@@ -723,12 +994,13 @@ func.func @negative_batch_matmul_broadcast_a(%A: tensor<16x8xf32>, %B: tensor<2x
 #map_bbcast2_a = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map_bbcast2_b = affine_map<(d0, d1, d2, d3) -> (d3)>
 #map_bbcast2_c = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-func.func @negative_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<8xf32>,
-                                              %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
+func.func @op_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<8xf32>,
+                                       %Out: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %0 = linalg.generic
-           {indexing_maps = [#map_bbcast2_a, #map_bbcast2_b, #map_bbcast2_c],
-            iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-           ins(%A, %B : tensor<2x16x8xf32>, tensor<8xf32>) outs(%Out : tensor<2x16x16xf32>) {
+    {indexing_maps = [#map_bbcast2_a, #map_bbcast2_b, #map_bbcast2_c],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
+    ins(%A, %B : tensor<2x16x8xf32>, tensor<8xf32>)
+    outs(%Out : tensor<2x16x16xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %1 = arith.mulf %in, %in_0 : f32
     %2 = arith.addf %out, %1 : f32
@@ -737,6 +1009,10 @@ func.func @negative_batch_matmul_broadcast_b(%A: tensor<2x16x8xf32>, %B: tensor<
   return %0 : tensor<2x16x16xf32>
 }
 
-// CHECK-LABEL: negative_batch_matmul_broadcast_b
-// CHECK: linalg.generic
-// CHECK-NOT: linalg.batch_matmul
+// ALL-LABEL: op_batch_matmul_broadcast_b
+
+// NAMED: linalg.generic
+// NAMED-NOT: linalg.batch_matmul
+
+// CATEGORY-NOT: linalg.generic
+// CATEGORY: linalg.contract