[flang-commits] [flang] [mlir] [mlir][Transforms] `GreedyPatternRewriteDriver`: Do not CSE constants during iterations (PR #75897)

Mon Dec 18 22:52:05 PST 2023

https://github.com/matthias-springer updated https://github.com/llvm/llvm-project/pull/75897

>From ef7badf671bb18407482d4a297ae0234d5437849 Mon Sep 17 00:00:00 2001
From: Matthias Springer <springerm at google.com>
Date: Tue, 19 Dec 2023 12:17:51 +0900
Subject: [PATCH 1/2] [mlir] Require folders to produce Values of same type

This commit adds extra assertions to `OperationFolder` and `OpBuilder` to ensure that the types of the folded SSA values match with the result types of the op. There used to be checks that discard the folded results if the types do not match. This commit makes these checks stricter and turns them into assertions.

Discarding folded results with the wrong type can hide bugs in op folders. Two such bugs became apparent and are fixed with this change.

Note: The existing type checks were introduced in https://reviews.llvm.org/D95991.
---
 flang/lib/Optimizer/Dialect/FIROps.cpp      |  6 ++++--
 mlir/lib/Dialect/Arith/IR/ArithOps.cpp      |  4 +++-
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp    |  5 +----
 mlir/lib/IR/Builders.cpp                    |  5 ++---
 mlir/lib/Transforms/Utils/FoldUtils.cpp     |  6 ++----
 mlir/test/Transforms/test-canonicalize.mlir | 13 -------------
 mlir/test/Transforms/test-legalizer.mlir    | 10 ----------
 mlir/test/lib/Dialect/Test/TestDialect.cpp  |  4 ----
 mlir/test/lib/Dialect/Test/TestOps.td       |  7 -------
 9 files changed, 12 insertions(+), 48 deletions(-)

diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index ab1e9c0ec7adc8..6d62e470706e53 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -625,11 +625,13 @@ void fir::BoxAddrOp::build(mlir::OpBuilder &builder,
 mlir::OpFoldResult fir::BoxAddrOp::fold(FoldAdaptor adaptor) {
   if (auto *v = getVal().getDefiningOp()) {
     if (auto box = mlir::dyn_cast<fir::EmboxOp>(v)) {
-      if (!box.getSlice()) // Fold only if not sliced
+      // Fold only if not sliced
+      if (!box.getSlice() && box.getMemref().getType() == getType())
         return box.getMemref();
     }
     if (auto box = mlir::dyn_cast<fir::EmboxCharOp>(v))
-      return box.getMemref();
+      if (box.getMemref().getType() == getType())
+        return box.getMemref();
   }
   return {};
 }
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 56d5e0fed76185..ff72becc8dfa77 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1352,9 +1352,11 @@ OpFoldResult arith::TruncIOp::fold(FoldAdaptor adaptor) {
       setOperand(src);
       return getResult();
     }
+
     // trunci(zexti(a)) -> a
     // trunci(sexti(a)) -> a
-    return src;
+    if (srcType == dstType)
+      return src;
   }
 
   // trunci(trunci(a)) -> trunci(a))
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 540959b486db9c..ac9485326a32ed 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -1600,11 +1600,8 @@ static Value foldExtractFromBroadcast(ExtractOp extractOp) {
     return llvm::isa<VectorType>(type) ? llvm::cast<VectorType>(type).getRank()
                                        : 0;
   };
-  // If splat or broadcast from a scalar, just return the source scalar.
-  unsigned broadcastSrcRank = getRank(source.getType());
-  if (broadcastSrcRank == 0)
-    return source;
 
+  unsigned broadcastSrcRank = getRank(source.getType());
   unsigned extractResultRank = getRank(extractOp.getType());
   if (extractResultRank >= broadcastSrcRank)
     return Value();
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 2cabfcd24d3559..c28cbe109c3ffd 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -491,9 +491,8 @@ LogicalResult OpBuilder::tryFold(Operation *op,
 
     // Normal values get pushed back directly.
     if (auto value = llvm::dyn_cast_if_present<Value>(std::get<0>(it))) {
-      if (value.getType() != expectedType)
-        return cleanupFailure();
-
+      assert(value.getType() == expectedType &&
+             "folder produced value of incorrect type");
       results.push_back(value);
       continue;
     }
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index 90ee5ba51de3ad..34b7117a035748 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -247,10 +247,8 @@ OperationFolder::processFoldResults(Operation *op,
 
     // Check if the result was an SSA value.
     if (auto repl = llvm::dyn_cast_if_present<Value>(foldResults[i])) {
-      if (repl.getType() != op->getResult(i).getType()) {
-        results.clear();
-        return failure();
-      }
+      assert(repl.getType() == op->getResult(i).getType() &&
+             "folder produced value of incorrect type");
       results.emplace_back(repl);
       continue;
     }
diff --git a/mlir/test/Transforms/test-canonicalize.mlir b/mlir/test/Transforms/test-canonicalize.mlir
index bc463fefe65342..4f0095ed7e8cf4 100644
--- a/mlir/test/Transforms/test-canonicalize.mlir
+++ b/mlir/test/Transforms/test-canonicalize.mlir
@@ -70,19 +70,6 @@ func.func @test_commutative_multi_cst(%arg0: i32, %arg1: i32) -> (i32, i32) {
   return %y, %z: i32, i32
 }
 
-// CHECK-LABEL: func @typemismatch
-
-func.func @typemismatch() -> i32 {
-  %c42 = arith.constant 42.0 : f32
-
-  // The "passthrough_fold" folder will naively return its operand, but we don't
-  // want to fold here because of the type mismatch.
-
-  // CHECK: "test.passthrough_fold"
-  %0 = "test.passthrough_fold"(%c42) : (f32) -> (i32)
-  return %0 : i32
-}
-
 // CHECK-LABEL: test_dialect_canonicalizer
 func.func @test_dialect_canonicalizer() -> (i32) {
   %0 = "test.dialect_canonicalizable"() : () -> (i32)
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 6897b6f95f0d05..d8cf6e4719cede 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -310,16 +310,6 @@ builtin.module {
 
 // -----
 
-// The "passthrough_fold" folder will naively return its operand, but we don't
-// want to fold here because of the type mismatch.
-func.func @typemismatch(%arg: f32) -> i32 {
-  // expected-remark at +1 {{op 'test.passthrough_fold' is not legalizable}}
-  %0 = "test.passthrough_fold"(%arg) : (f32) -> (i32)
-  "test.return"(%0) : (i32) -> ()
-}
-
-// -----
-
 // expected-remark @below {{applyPartialConversion failed}}
 module {
   func.func private @callee(%0 : f32) -> f32
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 21400a60e65321..a1b30705f16a98 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -542,10 +542,6 @@ OpFoldResult TestOpInPlaceFold::fold(FoldAdaptor adaptor) {
   return {};
 }
 
-OpFoldResult TestPassthroughFold::fold(FoldAdaptor adaptor) {
-  return getOperand();
-}
-
 OpFoldResult TestOpFoldWithFoldAdaptor::fold(FoldAdaptor adaptor) {
   int64_t sum = 0;
   if (auto value = dyn_cast_or_null<IntegerAttr>(adaptor.getOp()))
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 96f66c2ca06ecf..70ccc71883e3c1 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1363,13 +1363,6 @@ def TestOpFoldWithFoldAdaptor
   let hasFolder = 1;
 }
 
-// An op that always fold itself.
-def TestPassthroughFold : TEST_Op<"passthrough_fold"> {
-  let arguments = (ins AnyType:$op);
-  let results = (outs AnyType);
-  let hasFolder = 1;
-}
-
 def TestDialectCanonicalizerOp : TEST_Op<"dialect_canonicalizable"> {
   let arguments = (ins);
   let results = (outs I32);

>From d37d2fc9d5997edc0304f9c302eba2983f8b6ee5 Mon Sep 17 00:00:00 2001
From: Matthias Springer <springerm at google.com>
Date: Tue, 19 Dec 2023 15:05:49 +0900
Subject: [PATCH 2/2] [mlir][Transforms] `GreedyPatternRewriteDriver`: Do not
 CSE constants during iterations

The `GreedyPatternRewriteDriver` tries to iteratively fold ops and apply rewrite patterns to ops. It has special handling for constants: they are CSE'd and sometimes moved to parent regions to allow for additional CSE'ing. This happens in `OperationFolder`.

To allow for efficient CSE'ing, `OperationFolder` maintains an internal lookup data structure to find the existing constant ops with the same value for each `IsolatedFromAbove` region:
```c++
/// A mapping between an insertion region and the constants that have been
/// created within it.
DenseMap<Region *, ConstantMap> foldScopes;
```

Rewrite patterns are allowed to modify operations. In particular, they may move operations (including constants) from one region to another one. Such an IR rewrite can make the above lookup data structure inconsistent.

We encountered such a bug in a downstream project. This bug materialized in the form of an op that uses the result of a constant op from a different `IsolatedFromAbove` region (that is not accessible).

This commit changes the behavior of the `GreedyPatternRewriteDriver` such that `OperationFolder` is used to CSE constants at the beginning of each iteration (as the worklist is populated), but no longer during an iteration. `OperationFolder` is no longer used after populating the worklist, so we do not have to care about inconsistent state in the `OperationFolder` due to IR rewrites. The `GreedyPatternRewriteDriver` now performs the op folding by itself instead of calling `OperationFolder::tryToFold`.

This change changes the order of constant ops in test cases, but not the region in which they appear. All broken test cases were fixed by turning `CHECK` into `CHECK-DAG`.

Alternatives considered: The state of `OperationFolder` could be partially invalidated with every `notifyOperationModified` notification. That is more fragile than the solution in this commit because incorrect rewriter API usage can lead to missing notifications and hard-to-debug `IsolatedFromAbove` violations. (It did not fix the above mention bug in a downstream project, which could be due to incorrect rewriter API usage or due to another conceptual problem that I missed.) Moreover, ops are frequently getting modified during a greedy pattern rewrite, so we would likely keep invalidating large parts of the state of `OperationFolder` over and over.
---
 flang/test/Lower/array-temp.f90               | 54 +++++------
 .../Utils/GreedyPatternRewriteDriver.cpp      | 76 +++++++++++----
 .../VectorToArmSME/vector-to-arm-sme.mlir     | 10 +-
 .../VectorToLLVM/vector-to-llvm.mlir          |  8 +-
 .../Conversion/VectorToSCF/vector-to-scf.mlir | 30 +++---
 .../test/Dialect/ArmSME/arith-ops-to-sme.mlir | 16 ++--
 .../test/Dialect/LLVMIR/type-consistency.mlir | 58 +++++------
 mlir/test/Dialect/Linalg/loops.mlir           | 16 ++--
 .../Linalg/vectorization-with-patterns.mlir   |  6 +-
 .../Linalg/vectorize-tensor-extract.mlir      | 96 +++++++++----------
 .../Math/algebraic-simplification.mlir        | 28 +++---
 mlir/test/Dialect/Math/expand-math.mlir       | 24 ++---
 .../Math/polynomial-approximation.mlir        | 72 +++++++-------
 .../NVGPU/transform-create-async-groups.mlir  | 12 +--
 mlir/test/Dialect/SCF/loop-pipelining.mlir    |  6 +-
 mlir/test/Dialect/SparseTensor/sparse_1d.mlir |  8 +-
 .../Dialect/SparseTensor/sparse_affine.mlir   |  8 +-
 .../Dialect/SparseTensor/sparse_concat.mlir   |  8 +-
 .../Dialect/SparseTensor/sparse_storage.mlir  |  8 +-
 .../Tensor/fold-into-pack-and-unpack.mlir     | 16 ++--
 mlir/test/Dialect/Tosa/constant-op-fold.mlir  |  6 +-
 .../Tosa/tosa-decompose-depthwise.mlir        |  8 +-
 .../Tosa/tosa-decompose-transpose-conv.mlir   | 15 ++-
 .../vector-broadcast-lowering-transforms.mlir |  4 +-
 ...tract-to-matrix-intrinsics-transforms.mlir |  6 +-
 .../vector-mask-lowering-transforms.mlir      | 14 +--
 .../vector-multi-reduction-lowering.mlir      | 34 +++----
 .../vector-scalable-create-mask-lowering.mlir |  4 +-
 28 files changed, 342 insertions(+), 309 deletions(-)

diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90
index f8c2ec3e03c543..7d941188012a33 100644
--- a/flang/test/Lower/array-temp.f90
+++ b/flang/test/Lower/array-temp.f90
@@ -43,15 +43,15 @@ subroutine ss4(N)
 
 ! CHECK-LABEL: func @_QPss2(
 ! CHECK-SAME:               %arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:   %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
-! CHECK:   %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
-! CHECK:   %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
-! CHECK:   %[[C_27_i32:[-0-9a-z_]+]] = arith.constant 27 : i32
-! CHECK:   %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
-! CHECK:   %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
-! CHECK:   %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
-! CHECK:   %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
-! CHECK:   %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
+! CHECK-DAG: %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
+! CHECK-DAG: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
+! CHECK-DAG: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
+! CHECK-DAG: %[[C_27_i32:[-0-9a-z_]+]] = arith.constant 27 : i32
+! CHECK-DAG: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK-DAG: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
+! CHECK-DAG: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
+! CHECK-DAG: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
+! CHECK-DAG: %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
 ! CHECK:   %[[V_0:[0-9]+]] = fir.load %arg0 : !fir.ref<i32>
 ! CHECK:   %[[V_1:[0-9]+]] = fir.convert %[[V_0:[0-9]+]] : (i32) -> index
 ! CHECK:   %[[V_2:[0-9]+]] = arith.cmpi sgt, %[[V_1]], %[[C_0]] : index
@@ -137,15 +137,15 @@ subroutine ss4(N)
 
 ! CHECK-LABEL: func @_QPss3(
 ! CHECK-SAME:               %arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:   %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
-! CHECK:   %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
-! CHECK:   %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
-! CHECK:   %[[C_34_i32:[-0-9a-z_]+]] = arith.constant 34 : i32
-! CHECK:   %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
-! CHECK:   %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
-! CHECK:   %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
-! CHECK:   %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
-! CHECK:   %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
+! CHECK-DAG: %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
+! CHECK-DAG: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
+! CHECK-DAG: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
+! CHECK-DAG: %[[C_34_i32:[-0-9a-z_]+]] = arith.constant 34 : i32
+! CHECK-DAG: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK-DAG: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
+! CHECK-DAG: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
+! CHECK-DAG: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
+! CHECK-DAG: %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
 ! CHECK:   %[[V_0:[0-9]+]] = fir.load %arg0 : !fir.ref<i32>
 ! CHECK:   %[[V_1:[0-9]+]] = fir.convert %[[V_0:[0-9]+]] : (i32) -> index
 ! CHECK:   %[[V_2:[0-9]+]] = arith.cmpi sgt, %[[V_1]], %[[C_0]] : index
@@ -263,15 +263,15 @@ subroutine ss4(N)
 
 ! CHECK-LABEL: func @_QPss4(
 ! CHECK-SAME:               %arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
-! CHECK:   %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
-! CHECK:   %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
-! CHECK:   %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
-! CHECK:   %[[C_41_i32:[-0-9a-z_]+]] = arith.constant 41 : i32
-! CHECK:   %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
-! CHECK:   %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
-! CHECK:   %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
-! CHECK:   %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
-! CHECK:   %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
+! CHECK-DAG: %[[C_2:[-0-9a-z_]+]] = arith.constant 2 : index
+! CHECK-DAG: %[[C_m1:[-0-9a-z_]+]] = arith.constant -1 : index
+! CHECK-DAG: %[[C_1:[-0-9a-z_]+]] = arith.constant 1 : index
+! CHECK-DAG: %[[C_41_i32:[-0-9a-z_]+]] = arith.constant 41 : i32
+! CHECK-DAG: %[[C_m1_i32:[-0-9a-z_]+]] = arith.constant -1 : i32
+! CHECK-DAG: %[[C_st:[-0-9a-z_]+]] = arith.constant 7.000000e+00 : f32
+! CHECK-DAG: %[[C_1_i32:[-0-9a-z_]+]] = arith.constant 1 : i32
+! CHECK-DAG: %[[C_st_0:[-0-9a-z_]+]] = arith.constant -2.000000e+00 : f32
+! CHECK-DAG: %[[C_0:[-0-9a-z_]+]] = arith.constant 0 : index
 ! CHECK:   %[[V_0:[0-9]+]] = fir.load %arg0 : !fir.ref<i32>
 ! CHECK:   %[[V_1:[0-9]+]] = fir.convert %[[V_0:[0-9]+]] : (i32) -> index
 ! CHECK:   %[[V_2:[0-9]+]] = arith.cmpi sgt, %[[V_1]], %[[C_0]] : index
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index 7decbce018a878..ee4f761d4c5c0d 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -313,9 +313,6 @@ class GreedyPatternRewriteDriver : public PatternRewriter,
   Worklist worklist;
 #endif // MLIR_GREEDY_REWRITE_RANDOMIZER_SEED
 
-  /// Non-pattern based folder for operations.
-  OperationFolder folder;
-
   /// Configuration information for how to simplify.
   const GreedyRewriteConfig config;
 
@@ -357,7 +354,7 @@ class GreedyPatternRewriteDriver : public PatternRewriter,
 GreedyPatternRewriteDriver::GreedyPatternRewriteDriver(
     MLIRContext *ctx, const FrozenRewritePatternSet &patterns,
     const GreedyRewriteConfig &config)
-    : PatternRewriter(ctx), folder(ctx, this), config(config), matcher(patterns)
+    : PatternRewriter(ctx), config(config), matcher(patterns)
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
       // clang-format off
       , debugFingerPrints(this)
@@ -428,11 +425,47 @@ bool GreedyPatternRewriteDriver::processWorklist() {
       continue;
     }
 
-    // Try to fold this op.
-    if (succeeded(folder.tryToFold(op))) {
-      LLVM_DEBUG(logResultWithLine("success", "operation was folded"));
-      changed = true;
-      continue;
+    // Try to fold this op. Do not fold constant ops. That would lead to an
+    // infinite folding loop, as every constant op would be folded to an
+    // Attribute and then immediately be rematerialized as a constant op, which
+    // is then put on the worklist.
+    if (!op->hasTrait<OpTrait::ConstantLike>()) {
+      SmallVector<OpFoldResult> foldResults;
+      if (succeeded(op->fold(foldResults))) {
+        LLVM_DEBUG(logResultWithLine("success", "operation was folded"));
+        changed = true;
+        if (foldResults.empty()) {
+          // Op was modified in-place.
+          notifyOperationModified(op);
+          continue;
+        }
+
+        // Op results can be replaced with `foldResults`.
+        assert(foldResults.size() == op->getNumResults() &&
+               "folder produced incorrect number of results");
+        OpBuilder::InsertionGuard g(*this);
+        setInsertionPoint(op);
+        SmallVector<Value> replacements;
+        for (auto [ofr, resultType] :
+             llvm::zip_equal(foldResults, op->getResultTypes())) {
+          if (auto value = ofr.dyn_cast<Value>()) {
+            assert(value.getType() == resultType &&
+                   "folder produced value of incorrect type");
+            replacements.push_back(value);
+            continue;
+          }
+          // Materialize Attributes as SSA values.
+          Operation *constOp = op->getDialect()->materializeConstant(
+              *this, ofr.get<Attribute>(), resultType, op->getLoc());
+          assert(constOp->hasTrait<OpTrait::ConstantLike>() &&
+                 "materializeConstant produced op that is not a ConstantLike");
+          assert(constOp->getResultTypes()[0] == resultType &&
+                 "materializeConstant produced incorrect result type");
+          replacements.push_back(constOp->getResult(0));
+        }
+        replaceOp(op, replacements);
+        continue;
+      }
     }
 
     // Try to match one of the patterns. The rewriter is automatically
@@ -567,7 +600,6 @@ void GreedyPatternRewriteDriver::notifyOperationRemoved(Operation *op) {
 
   addOperandsToWorklist(op->getOperands());
   worklist.remove(op);
-  folder.notifyRemoval(op);
 
   if (config.strictMode != GreedyRewriteStrictness::AnyOp)
     strictModeFilteredOps.erase(op);
@@ -647,16 +679,6 @@ class GreedyPatternRewriteIteration
 } // namespace
 
 LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
-  auto insertKnownConstant = [&](Operation *op) {
-    // Check for existing constants when populating the worklist. This avoids
-    // accidentally reversing the constant order during processing.
-    Attribute constValue;
-    if (matchPattern(op, m_Constant(&constValue)))
-      if (!folder.insertKnownConstant(op, constValue))
-        return true;
-    return false;
-  };
-
   bool continueRewrites = false;
   int64_t iteration = 0;
   MLIRContext *ctx = getContext();
@@ -666,8 +688,22 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
         config.maxIterations != GreedyRewriteConfig::kNoLimit)
       break;
 
+    // New iteration: start with an empty worklist.
     worklist.clear();
 
+    // `OperationFolder` CSE's constant ops (and may move them into parents
+    // regions to enable more aggressive CSE'ing).
+    OperationFolder folder(getContext(), this);
+    auto insertKnownConstant = [&](Operation *op) {
+      // Check for existing constants when populating the worklist. This avoids
+      // accidentally reversing the constant order during processing.
+      Attribute constValue;
+      if (matchPattern(op, m_Constant(&constValue)))
+        if (!folder.insertKnownConstant(op, constValue))
+          return true;
+      return false;
+    };
+
     if (!config.useTopDownTraversal) {
       // Add operations to the worklist in postorder.
       region.walk([&](Operation *op) {
diff --git a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
index 6783263c184961..d3f02c6288a240 100644
--- a/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
+++ b/mlir/test/Conversion/VectorToArmSME/vector-to-arm-sme.mlir
@@ -309,9 +309,9 @@ func.func @transfer_write_2d_transpose_with_mask_bf16(%vector : vector<[8]x[8]xb
 
 // CHECK-LABEL:   func.func @broadcast_vec2d_from_i32(
 // CHECK-SAME:                                        %[[SRC:.*]]: i32) {
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[C4:.*]] = arith.constant 4 : index
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[SRC_1D:.*]] = vector.broadcast %[[SRC]] : i32 to vector<[4]xi32>
 // CHECK: %[[INIT_TILE:.*]] = arm_sme.get_tile : vector<[4]x[4]xi32>
 // CHECK: %[[VSCALE:.*]] = vector.vscale
@@ -393,8 +393,8 @@ func.func @splat_vec2d_from_f16(%arg0: f16) {
 
 // CHECK-LABEL:   func.func @transpose_i8(
 // CHECK-SAME:                            %[[TILE:.*]]: vector<[16]x[16]xi8>)
-// CHECK:           %[[C16:.*]] = arith.constant 16 : index
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK:           %[[VSCALE:.*]] = vector.vscale
 // CHECK:           %[[MIN_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C16]] : index
 // CHECK:           %[[NUM_TILE_SLICES:.*]] = memref.alloca(%[[MIN_TILE_SLICES]], %[[MIN_TILE_SLICES]]) : memref<?x?xi8>
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 012d30d96799f2..90f134d57a3c13 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -196,10 +196,10 @@ func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32>
 }
 // CHECK-LABEL: @broadcast_vec3d_from_vec1d(
 // CHECK-SAME:  %[[A:.*]]: vector<2xf32>)
-// CHECK:       %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32>
-// CHECK:       %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>>
-// CHECK:       %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32>
-// CHECK:       %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x2xf32> to !llvm.array<4 x array<3 x vector<2xf32>>>
+// CHECK-DAG:   %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32>
+// CHECK-DAG:   %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>>
+// CHECK-DAG:   %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32>
+// CHECK-DAG:   %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x2xf32> to !llvm.array<4 x array<3 x vector<2xf32>>>
 
 // CHECK:       %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][0] : !llvm.array<3 x vector<2xf32>>
 // CHECK:       %[[T4:.*]] = llvm.insertvalue %[[A]], %[[T3]][1] : !llvm.array<3 x vector<2xf32>>
diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
index ad78f0c945b24d..97eda0672c0299 100644
--- a/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
+++ b/mlir/test/Conversion/VectorToSCF/vector-to-scf.mlir
@@ -533,9 +533,9 @@ func.func @transfer_write_scalable(%arg0: memref<?xf32, strided<[?], offset: ?>>
 }
 
 // CHECK-SAME:      %[[ARG_0:.*]]: memref<?xf32, strided<[?], offset: ?>>,
-// CHECK:           %[[C_0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C_16:.*]] = arith.constant 16 : index
-// CHECK:           %[[STEP:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C_0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C_16:.*]] = arith.constant 16 : index
+// CHECK-DAG:       %[[STEP:.*]] = arith.constant 1 : index
 // CHECK:           %[[MASK_VEC:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : vector<[16]xi32>
 // CHECK:           %[[VSCALE:.*]] = vector.vscale
 // CHECK:           %[[UB:.*]] = arith.muli %[[VSCALE]], %[[C_16]] : index
@@ -556,8 +556,8 @@ func.func @vector_print_vector_0d(%arg0: vector<f32>) {
 }
 // CHECK-LABEL:   func.func @vector_print_vector_0d(
 // CHECK-SAME:                                      %[[VEC:.*]]: vector<f32>) {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
 // CHECK:           vector.print punctuation <open>
 // CHECK:           scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
@@ -581,9 +581,9 @@ func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
 }
 // CHECK-LABEL:   func.func @vector_print_vector(
 // CHECK-SAME:                                   %[[VEC:.*]]: vector<2x2xf32>) {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C2:.*]] = arith.constant 2 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
 // CHECK:           vector.print punctuation <open>
 // CHECK:           scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
@@ -650,10 +650,10 @@ func.func @transfer_read_array_of_scalable(%arg0: memref<3x?xf32>) -> vector<3x[
 }
 // CHECK-LABEL:   func.func @transfer_read_array_of_scalable(
 // CHECK-SAME:                                               %[[ARG:.*]]: memref<3x?xf32>) -> vector<3x[4]xf32> {
-// CHECK:           %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C3:.*]] = arith.constant 3 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
 // CHECK:           %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
 // CHECK:           %[[DIM_SIZE:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<3x?xf32>
@@ -684,9 +684,9 @@ func.func @transfer_write_array_of_scalable(%vec: vector<3x[4]xf32>, %arg0: memr
 // CHECK-LABEL:   func.func @transfer_write_array_of_scalable(
 // CHECK-SAME:                                                %[[VEC:.*]]: vector<3x[4]xf32>,
 // CHECK-SAME:                                                %[[MEMREF:.*]]: memref<3x?xf32>) {
-// CHECK:           %[[C0:.*]] = arith.constant 0 : index
-// CHECK:           %[[C3:.*]] = arith.constant 3 : index
-// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
 // CHECK:           %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
 // CHECK:           %[[DIM_SIZE:.*]] = memref.dim %[[MEMREF]], %[[C1]] : memref<3x?xf32>
diff --git a/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir b/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir
index ae2d0f40f03af5..e51f2485dadbcc 100644
--- a/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir
+++ b/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir
@@ -91,10 +91,10 @@ func.func @arith_constant_dense_2d_zero_f64() {
 // -----
 
 // CHECK-LABEL: func.func @arith_constant_dense_2d_nonzero_i8() {
-// CHECK: %[[C2_SPLAT:.*]] = arith.constant dense<2> : vector<[16]xi8>
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[C16:.*]] = arith.constant 16 : index
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2_SPLAT:.*]] = arith.constant dense<2> : vector<[16]xi8>
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[INIT_TILE:.*]] = arm_sme.get_tile : vector<[16]x[16]xi8>
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C16]] : index
@@ -111,10 +111,10 @@ func.func @arith_constant_dense_2d_nonzero_i8() {
 // -----
 
 // CHECK-LABEL: func.func @arith_constant_dense_2d_nonzero_f64() {
-// CHECK: %[[C2_SPLAT:.*]] = arith.constant dense<2.000000e+00> : vector<[2]xf64>
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
-// CHECK: %[[C2:.*]] = arith.constant 2 : index
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2_SPLAT:.*]] = arith.constant dense<2.000000e+00> : vector<[2]xf64>
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[INIT_TILE:.*]] = arm_sme.get_tile : vector<[2]x[2]xf64>
 // CHECK: %[[VSCALE:.*]] = vector.vscale
 // CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[VSCALE]], %[[C2]] : index
diff --git a/mlir/test/Dialect/LLVMIR/type-consistency.mlir b/mlir/test/Dialect/LLVMIR/type-consistency.mlir
index 3a1ab924ebdacb..021151b929d8e2 100644
--- a/mlir/test/Dialect/LLVMIR/type-consistency.mlir
+++ b/mlir/test/Dialect/LLVMIR/type-consistency.mlir
@@ -168,8 +168,8 @@ llvm.func @no_crash_on_negative_gep_index() {
 // CHECK-LABEL: llvm.func @coalesced_store_ints
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_ints(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
 
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32)>
@@ -193,8 +193,8 @@ llvm.func @coalesced_store_ints(%arg: i64) {
 // CHECK-LABEL: llvm.func @coalesced_store_ints_offset
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_ints_offset(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32)>
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32)> : (i32) -> !llvm.ptr
@@ -218,8 +218,8 @@ llvm.func @coalesced_store_ints_offset(%arg: i64) {
 // CHECK-LABEL: llvm.func @coalesced_store_floats
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_floats(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
   %0 = llvm.mlir.constant(1 : i32) : i32
 
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (f32, f32)>
@@ -292,9 +292,9 @@ llvm.func @coalesced_store_past_end(%arg: i64) {
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_packed_struct(%arg: i64) {
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
-  // CHECK: %[[CST48:.*]] = llvm.mlir.constant(48 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
+  // CHECK-DAG: %[[CST48:.*]] = llvm.mlir.constant(48 : i64) : i64
 
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", packed (i16, i32, i16)>
   %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i16, i32, i16)> : (i32) -> !llvm.ptr
@@ -320,10 +320,10 @@ llvm.func @coalesced_store_packed_struct(%arg: i64) {
 // CHECK-LABEL: llvm.func @vector_write_split
 // CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
 llvm.func @vector_write_split(%arg: vector<4xi32>) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
-  // CHECK: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
+  // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32, i32)>
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32, i32)> : (i32) -> !llvm.ptr
@@ -354,10 +354,10 @@ llvm.func @vector_write_split(%arg: vector<4xi32>) {
 // CHECK-LABEL: llvm.func @vector_write_split_offset
 // CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
 llvm.func @vector_write_split_offset(%arg: vector<4xi32>) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
-  // CHECK: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
+  // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32, i32, i32)> : (i32) -> !llvm.ptr
@@ -470,8 +470,8 @@ llvm.func @gep_split(%arg: i64) {
 // CHECK-LABEL: llvm.func @coalesced_store_ints_subaggregate
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_ints_subaggregate(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, struct<(i32, i32)>)> : (i32) -> !llvm.ptr
@@ -513,8 +513,8 @@ llvm.func @gep_result_ptr_type_dynamic(%arg: i64) {
 // CHECK-LABEL: llvm.func @overlapping_int_aggregate_store
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @overlapping_int_aggregate_store(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
 
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
@@ -548,10 +548,10 @@ llvm.func @overlapping_int_aggregate_store(%arg: i64) {
 // CHECK-LABEL: llvm.func @overlapping_vector_aggregate_store
 // CHECK-SAME: %[[ARG:.*]]: vector<4xi16>
 llvm.func @overlapping_vector_aggregate_store(%arg: vector<4 x i16>) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
-  // CHECK: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
-  // CHECK: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
+  // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
 
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
@@ -586,8 +586,8 @@ llvm.func @overlapping_vector_aggregate_store(%arg: vector<4 x i16>) {
 // CHECK-LABEL: llvm.func @partially_overlapping_aggregate_store
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @partially_overlapping_aggregate_store(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
 
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)>
@@ -644,8 +644,8 @@ llvm.func @undesirable_overlapping_aggregate_store(%arg: i64) {
 // CHECK-LABEL: llvm.func @coalesced_store_ints_array
 // CHECK-SAME: %[[ARG:.*]]: i64
 llvm.func @coalesced_store_ints_array(%arg: i64) {
-  // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
-  // CHECK: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+  // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
 
   %0 = llvm.mlir.constant(1 : i32) : i32
   // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x i32>
diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
index 8c13422fd63833..b818170a8e7974 100644
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -693,8 +693,8 @@ func.func @conv1d_no_symbols(%in : memref<?xf32>, %filter : memref<?xf32>, %out
 //  CHECK-SAME: %[[arg0:[a-zA-Z0-9]+]]: memref<?xf32>
 //  CHECK-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?xf32>
 //  CHECK-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?xf32>
-//       CHECK: %[[c0:.*]] = arith.constant 0 : index
-//       CHECK: %[[c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
 //       CHECK: %[[dim0:.*]] = memref.dim %[[arg1]], %[[c0]] : memref<?xf32>
 //       CHECK: %[[dim1:.*]] = memref.dim %[[arg2]], %[[c0]] : memref<?xf32>
 //       CHECK: scf.for %[[b:.*]] = %[[c0]] to %[[dim1]] step %[[c1]] {
@@ -711,8 +711,8 @@ func.func @conv1d_no_symbols(%in : memref<?xf32>, %filter : memref<?xf32>, %out
 //  CHECKPARALLEL-SAME: %[[arg0:[a-zA-Z0-9]+]]: memref<?xf32>
 //  CHECKPARALLEL-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?xf32>
 //  CHECKPARALLEL-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?xf32>
-//       CHECKPARALLEL: %[[c0:.*]] = arith.constant 0 : index
-//       CHECKPARALLEL: %[[c1:.*]] = arith.constant 1 : index
+//   CHECKPARALLEL-DAG: %[[c0:.*]] = arith.constant 0 : index
+//   CHECKPARALLEL-DAG: %[[c1:.*]] = arith.constant 1 : index
 //       CHECKPARALLEL: %[[dim0:.*]] = memref.dim %[[arg1]], %[[c0]] : memref<?xf32>
 //       CHECKPARALLEL: %[[dim1:.*]] = memref.dim %[[arg2]], %[[c0]] : memref<?xf32>
 //       CHECKPARALLEL: scf.parallel (%[[b:.*]]) = (%[[c0]]) to (%[[dim1]]) step (%[[c1]]) {
@@ -735,8 +735,8 @@ func.func @conv2d_no_symbols(%in : memref<?x?xf32>, %filter : memref<?x?xf32>, %
 //  CHECK-SAME: %[[arg0:[a-zA-Z0-9]+]]: memref<?x?xf32>
 //  CHECK-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?x?xf32>
 //  CHECK-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?x?xf32>
-//       CHECK: %[[c0:.*]] = arith.constant 0 : index
-//       CHECK: %[[c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
 //       CHECK: %[[dim0:.*]] = memref.dim %[[arg1]], %[[c0]] : memref<?x?xf32>
 //       CHECK: %[[dim1:.*]] = memref.dim %[[arg1]], %[[c1]] : memref<?x?xf32>
 //       CHECK: %[[dim2:.*]] = memref.dim %[[arg2]], %[[c0]] : memref<?x?xf32>
@@ -760,8 +760,8 @@ func.func @conv2d_no_symbols(%in : memref<?x?xf32>, %filter : memref<?x?xf32>, %
 //  CHECKPARALLEL-SAME: %[[arg0:[a-zA-Z0-9]+]]: memref<?x?xf32>
 //  CHECKPARALLEL-SAME: %[[arg1:[a-zA-Z0-9]+]]: memref<?x?xf32>
 //  CHECKPARALLEL-SAME: %[[arg2:[a-zA-Z0-9]+]]: memref<?x?xf32>
-//       CHECKPARALLEL: %[[c0:.*]] = arith.constant 0 : index
-//       CHECKPARALLEL: %[[c1:.*]] = arith.constant 1 : index
+//   CHECKPARALLEL-DAG: %[[c0:.*]] = arith.constant 0 : index
+//   CHECKPARALLEL-DAG: %[[c1:.*]] = arith.constant 1 : index
 //       CHECKPARALLEL: %[[dim0:.*]] = memref.dim %[[arg1]], %[[c0]] : memref<?x?xf32>
 //       CHECKPARALLEL: %[[dim1:.*]] = memref.dim %[[arg1]], %[[c1]] : memref<?x?xf32>
 //       CHECKPARALLEL: %[[dim2:.*]] = memref.dim %[[arg2]], %[[c0]] : memref<?x?xf32>
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index a43cc2df88cadf..58d4b21ea2dd90 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -335,9 +335,9 @@ func.func @vectorize_affine_apply(%arg0: tensor<5xf32>, %arg3: index) -> tensor<
 // CHECK-LABEL:  func.func @vectorize_affine_apply
 // CHECK-SAME: %arg0: tensor<5xf32>
 // CHECK-SAME: %[[ARG1:.*]]: index
-// CHECK:   %[[CST:.*]] = arith.constant dense<[123, 124, 125, 126, 127]> : vector<5xindex>
-// CHECK:   %[[CST_0:.*]] = arith.constant dense<1> : vector<5xindex>
-// CHECK:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<[123, 124, 125, 126, 127]> : vector<5xindex>
+// CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<1> : vector<5xindex>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK:   %[[EMPTY:.*]] = tensor.empty() : tensor<5xi32>
 // CHECK:   %[[BCAST:.*]] = vector.broadcast %[[ARG1]] : index to vector<5xindex>
 // CHECK:   %[[ADDI_1:.*]] = arith.addi %[[BCAST]], %[[CST]] : vector<5xindex>
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 3fd4fcd536624c..96953c234a0873 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -17,9 +17,9 @@ func.func @vectorize_1d_tensor_extract(%arg0: tensor<3xf32>, %arg1: tensor<4x3xi
 // CHECK-LABEL: func.func @vectorize_1d_tensor_extract
 // CHECK-SAME:    %[[ARG0:.*]]: tensor<3xf32>
 // CHECK-SAME:    %[[ARG1:.*]]: tensor<4x3xi32>
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
-// CHECK: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
+// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
 // CHECK: %[[V0:.*]] = vector.transfer_read %[[ARG1]]
 // CHECK: %[[CAST:.*]] = arith.index_cast %[[V0]]
 // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[CAST]]
@@ -58,7 +58,7 @@ func.func @vectorize_nd_tensor_extract_constant_idx(%arg0: tensor<3x3xf32>, %arg
 // CHECK-SAME:      %[[ARG_1:.*]]: tensor<1x1x3xf32>) -> tensor<1x1x3xf32> {
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:       arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[C0_f32_2:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[READ:.*]] = vector.transfer_read  %[[ARG_0]][%[[C1]], %[[C2]]], %[[C0_f32]] {in_bounds = [true, true, true], permutation_map = #[[$MAP]]} : tensor<3x3xf32>, vector<1x1x3xf32>
 // CHECK:           %[[C0_4:.*]] = arith.constant 0 : index
@@ -93,10 +93,10 @@ func.func @vectorize_nd_tensor_extract_transfer_read_basic(%arg0: tensor<3x3x3xf
 // CHECK-LABEL: func.func @vectorize_nd_tensor_extract_transfer_read_basic
 // CHECK-SAME: %[[ARG0:.*]]: tensor<3x3x3xf32>
 // CHECK-SAME: %[[ARG1:.*]]: tensor<1x1x3xf32>
-// CHECK:   %[[CST:.*]] = arith.constant dense<0> : vector<1x1x3xindex>
-// CHECK:   %[[C0_i32:.*]] = arith.constant 0 : i32
-// CHECK:   %[[C0:.*]] = arith.constant 0 : index
-// CHECK:   %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1x3xindex>
+// CHECK-DAG: %[[C0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:   %[[IDX_VEC0:.*]] = vector.shape_cast %[[CST]] : vector<1x1x3xindex> to vector<3xindex>
 // CHECK:   %[[IDX1:.*]] = vector.extractelement %[[IDX_VEC0]][%[[C0_i32]] : i32] : vector<3xindex>
 // CHECK:   %[[IDX_VEC:.*]] = vector.shape_cast %[[CST]] : vector<1x1x3xindex> to vector<3xindex>
@@ -139,11 +139,11 @@ func.func @vectorize_nd_tensor_extract_transfer_read_complex(%6: tensor<45x80x16
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<45x80x16xf32>,
 // CHECK-SAME:      %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index,
 // CHECK-SAME:      %[[VAL_5:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK:           %[[VAL_6:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_9:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_10:.*]] = arith.constant 79 : index
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 79 : index
 // CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : index
 // CHECK:           %[[VAL_12:.*]] = vector.broadcast %[[VAL_11]] : index to vector<1x4xindex>
 // CHECK:           %[[VAL_13:.*]] = vector.broadcast %[[VAL_3]] : index to vector<4xindex>
@@ -191,11 +191,11 @@ func.func @vectorize_nd_tensor_extract_index_from_tensor(%arg0: tensor<3x3xf32>,
 // CHECK-SAME: %[[ARG2:arg2]]: tensor<4x3xi32>
 // CHECK-SAME: %[[ARG3:.*]]: tensor<4x7x2xf32>
 // CHECK-SAME: %[[ARG4:.*]]: tensor<4x7x3x2xf32>
-// CHECK:    %[[C0:.*]] = arith.constant 0 : index
-// CHECK:    %[[C0_i32:.*]] = arith.constant 0 : i32
-// CHECK:    %[[CST:.*]] = arith.constant dense<3> : vector<7x2x4x3xindex>
-// CHECK:    %[[CST_1:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
-// CHECK:    %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<3> : vector<7x2x4x3xindex>
+// CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<true> : vector<4x7x3x2xi1>
+// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<4x7x3x2xf32>
 // CHECK:    %[[V0:.*]] = vector.transfer_read %[[ARG1]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
 // CHECK:    %[[V1:.*]] = vector.transfer_read %[[ARG2]][%[[C0]], %[[C0]]], %[[C0_i32]] {in_bounds = [true, true]} : tensor<4x3xi32>, vector<4x3xi32>
 // CHECK:    %[[CAST:.*]] = arith.index_cast %[[V0]] : vector<4x3xi32> to vector<4x3xindex>
@@ -239,12 +239,12 @@ func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_contiguous_and_gather(
 // CHECK-SAME:                    %[[VAL_0:.*]]: tensor<6xf32>
 // CHECK-SAME:                    %[[VAL_1:.*]]: tensor<5xi32>
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_4:.*]] = arith.constant dense<0> : vector<5xindex>
-// CHECK:           %[[VAL_5:.*]] = arith.constant dense<5> : vector<5xindex>
-// CHECK:           %[[VAL_6:.*]] = arith.constant dense<true> : vector<5xi1>
-// CHECK:           %[[VAL_7:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<0> : vector<5xindex>
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<5> : vector<5xindex>
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<true> : vector<5xi1>
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
 // CHECK:           %[[VAL_8:.*]] = tensor.empty() : tensor<5xf32>
 // CHECK:           %[[VAL_9:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<5xi32>, vector<5xi32>
 // CHECK:           %[[VAL_10:.*]] = arith.index_cast %[[VAL_9]] : vector<5xi32> to vector<5xindex>
@@ -285,11 +285,11 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<8
 // CHECK-SAME:                                                                        %[[VAL_0:.*]]: tensor<80x16xf32>,
 // CHECK-SAME:                                                                        %[[VAL_1:.*]]: index,
 // CHECK-SAME:                                                                        %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 79 : index
 // CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
 // CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
 // CHECK:           %[[VAL_10:.*]] = vector.extractelement %[[VAL_9]]{{\[}}%[[VAL_4]] : i32] : vector<4xindex>
@@ -373,11 +373,11 @@ func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16
 // CHECK-SAME:                                                                    %[[VAL_0:.*]]: tensor<80x16xf32>,
 // CHECK-SAME:                                                                    %[[VAL_1:.*]]: index,
 // CHECK-SAME:                                                                    %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
-// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
 // CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
 // CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
 // CHECK:           %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
@@ -418,11 +418,11 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
 // CHECK-SAME:                                                             %[[VAL_0:.*]]: tensor<80x16xf32>,
 // CHECK-SAME:                                                             %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
-// CHECK:           %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
-// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
-// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
 // CHECK:           %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
 // CHECK:           %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
@@ -461,10 +461,10 @@ func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
 // CHECK-SAME:                                                                 %[[VAL_0:.*]]: tensor<80x16xf32>,
 // CHECK-SAME:                                                                 %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant dense<16> : vector<1x4xindex>
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i32
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK:           %[[VAL_6:.*]] = vector.shape_cast %[[VAL_2]] : vector<1x4xindex> to vector<4xindex>
 // CHECK:           %[[VAL_7:.*]] = vector.extractelement %[[VAL_6]]{{\[}}%[[VAL_3]] : i32] : vector<4xindex>
 // CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
@@ -499,11 +499,11 @@ func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1:
 // CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_block_arg(
 // CHECK-SAME:                                                     %[[VAL_0:.*]]: tensor<5x6xf32>,
 // CHECK-SAME:                                                     %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
-// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
-// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
-// CHECK:           %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
 // CHECK:           %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
 // CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
 // CHECK:           %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
diff --git a/mlir/test/Dialect/Math/algebraic-simplification.mlir b/mlir/test/Dialect/Math/algebraic-simplification.mlir
index 21c9f7a8e7f17b..a97ecc52a17e97 100644
--- a/mlir/test/Dialect/Math/algebraic-simplification.mlir
+++ b/mlir/test/Dialect/Math/algebraic-simplification.mlir
@@ -38,8 +38,8 @@ func.func @pow_cube(%arg0: f32, %arg1 : vector<4xf32>) -> (f32, vector<4xf32>) {
 
 // CHECK-LABEL: @pow_recip
 func.func @pow_recip(%arg0: f32, %arg1 : vector<4xf32>) -> (f32, vector<4xf32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1.0{{.*}} : f32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1.0{{.*}}> : vector<4xf32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1.0{{.*}} : f32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1.0{{.*}}> : vector<4xf32>
   // CHECK: %[[SCALAR:.*]] = arith.divf %[[CST_S]], %arg0
   // CHECK: %[[VECTOR:.*]] = arith.divf %[[CST_V]], %arg1
   // CHECK: return %[[SCALAR]], %[[VECTOR]]
@@ -110,8 +110,8 @@ func.func @ipowi_zero_exp(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi3
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xi32>
 // CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) {
 func.func @ipowi_exp_one(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1 : i32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
   // CHECK: %[[SCALAR:.*]] = arith.divsi %[[CST_S]], %[[ARG0]]
   // CHECK: %[[VECTOR:.*]] = arith.divsi %[[CST_V]], %[[ARG1]]
   // CHECK: return %[[ARG0]], %[[ARG1]], %[[SCALAR]], %[[VECTOR]]
@@ -131,8 +131,8 @@ func.func @ipowi_exp_one(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xi32>
 // CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) {
 func.func @ipowi_exp_two(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1 : i32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
   // CHECK: %[[SCALAR0:.*]] = arith.muli %[[ARG0]], %[[ARG0]]
   // CHECK: %[[VECTOR0:.*]] = arith.muli %[[ARG1]], %[[ARG1]]
   // CHECK: %[[SCALAR1:.*]] = arith.divsi %[[CST_S]], %[[ARG0]]
@@ -156,8 +156,8 @@ func.func @ipowi_exp_two(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xi32>
 // CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) {
 func.func @ipowi_exp_three(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1 : i32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32>
   // CHECK: %[[SMUL0:.*]] = arith.muli %[[ARG0]], %[[ARG0]]
   // CHECK: %[[SCALAR0:.*]] = arith.muli %[[SMUL0]], %[[ARG0]]
   // CHECK: %[[VMUL0:.*]] = arith.muli %[[ARG1]], %[[ARG1]]
@@ -200,8 +200,8 @@ func.func @fpowi_zero_exp(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf3
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xf32>
 // CHECK-SAME: -> (f32, vector<4xf32>, f32, vector<4xf32>) {
 func.func @fpowi_exp_one(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf32>, f32, vector<4xf32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
   // CHECK: %[[SCALAR:.*]] = arith.divf %[[CST_S]], %[[ARG0]]
   // CHECK: %[[VECTOR:.*]] = arith.divf %[[CST_V]], %[[ARG1]]
   // CHECK: return %[[ARG0]], %[[ARG1]], %[[SCALAR]], %[[VECTOR]]
@@ -221,8 +221,8 @@ func.func @fpowi_exp_one(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf32
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xf32>
 // CHECK-SAME: -> (f32, vector<4xf32>, f32, vector<4xf32>) {
 func.func @fpowi_exp_two(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf32>, f32, vector<4xf32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
   // CHECK: %[[SCALAR0:.*]] = arith.mulf %[[ARG0]], %[[ARG0]]
   // CHECK: %[[VECTOR0:.*]] = arith.mulf %[[ARG1]], %[[ARG1]]
   // CHECK: %[[SCALAR1:.*]] = arith.divf %[[CST_S]], %[[ARG0]]
@@ -246,8 +246,8 @@ func.func @fpowi_exp_two(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf32
 // CHECK-SAME: %[[ARG1:.+]]: vector<4xf32>
 // CHECK-SAME: -> (f32, vector<4xf32>, f32, vector<4xf32>) {
 func.func @fpowi_exp_three(%arg0: f32, %arg1: vector<4xf32>) -> (f32, vector<4xf32>, f32, vector<4xf32>) {
-  // CHECK: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
-  // CHECK: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
+  // CHECK-DAG: %[[CST_S:.*]] = arith.constant 1.000000e+00 : f32
+  // CHECK-DAG: %[[CST_V:.*]] = arith.constant dense<1.000000e+00> : vector<4xf32>
   // CHECK: %[[SMUL0:.*]] = arith.mulf %[[ARG0]], %[[ARG0]]
   // CHECK: %[[SCALAR0:.*]] = arith.mulf %[[SMUL0]], %[[ARG0]]
   // CHECK: %[[VMUL0:.*]] = arith.mulf %[[ARG1]], %[[ARG1]]
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index 3e1c3462fc8fc6..6ee65b085dad1b 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -65,18 +65,18 @@ func.func @ctlz(%arg: i32) -> i32 {
 
 // CHECK-LABEL: @ctlz
 // CHECK-SAME: %[[ARG0:.+]]: i32
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16
-// CHECK-DAG: %[[C65535:.+]] = arith.constant 65535
-// CHECK-DAG: %[[C8:.+]] = arith.constant 8
-// CHECK-DAG: %[[C16777215:.+]] = arith.constant 16777215
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4
-// CHECK-DAG: %[[C268435455:.+]] = arith.constant 268435455
-// CHECK-DAG: %[[C2:.+]] = arith.constant 2
-// CHECK-DAG: %[[C1073741823:.+]] = arith.constant 1073741823
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1
-// CHECK-DAG: %[[C2147483647:.+]] = arith.constant 2147483647
-// CHECK-DAG: %[[C32:.+]] = arith.constant 32
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C65535:.+]] = arith.constant 65535 : i32
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : i32
+// CHECK-DAG: %[[C16777215:.+]] = arith.constant 16777215 : i32
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : i32
+// CHECK-DAG: %[[C268435455:.+]] = arith.constant 268435455 : i32
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : i32
+// CHECK-DAG: %[[C1073741823:.+]] = arith.constant 1073741823 : i32
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32
+// CHECK-DAG: %[[C2147483647:.+]] = arith.constant 2147483647 : i32
+// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : i32
 
 // CHECK: %[[PRED:.+]] = arith.cmpi ule, %[[ARG0]], %[[C65535]]
 // CHECK: %[[SHL:.+]] = arith.shli %[[ARG0]], %[[C16]]
diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir
index 6743998d6162c4..834a7dc0af66d6 100644
--- a/mlir/test/Dialect/Math/polynomial-approximation.mlir
+++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir
@@ -279,28 +279,28 @@ func.func @expm1_vector(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
 
 // CHECK-LABEL:   func @log_scalar(
 // CHECK-SAME:                             %[[X:.*]]: f32) -> f32 {
-// CHECK:           %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32
-// CHECK:           %[[VAL_3:.*]] = arith.constant -5.000000e-01 : f32
-// CHECK:           %[[VAL_4:.*]] = arith.constant 1.17549435E-38 : f32
-// CHECK:           %[[VAL_5:.*]] = arith.constant 0xFF800000 : f32
-// CHECK:           %[[VAL_6:.*]] = arith.constant 0x7F800000 : f32
-// CHECK:           %[[VAL_7:.*]] = arith.constant 0x7FC00000 : f32
-// CHECK:           %[[VAL_8:.*]] = arith.constant 0.707106769 : f32
-// CHECK:           %[[VAL_9:.*]] = arith.constant 0.0703768358 : f32
-// CHECK:           %[[VAL_10:.*]] = arith.constant -0.115146101 : f32
-// CHECK:           %[[VAL_11:.*]] = arith.constant 0.116769984 : f32
-// CHECK:           %[[VAL_12:.*]] = arith.constant -0.12420141 : f32
-// CHECK:           %[[VAL_13:.*]] = arith.constant 0.142493233 : f32
-// CHECK:           %[[VAL_14:.*]] = arith.constant -0.166680574 : f32
-// CHECK:           %[[VAL_15:.*]] = arith.constant 0.200007141 : f32
-// CHECK:           %[[VAL_16:.*]] = arith.constant -0.24999994 : f32
-// CHECK:           %[[VAL_17:.*]] = arith.constant 0.333333313 : f32
-// CHECK:           %[[VAL_18:.*]] = arith.constant 1.260000e+02 : f32
-// CHECK:           %[[VAL_19:.*]] = arith.constant -2139095041 : i32
-// CHECK:           %[[VAL_20:.*]] = arith.constant 1056964608 : i32
-// CHECK:           %[[VAL_21:.*]] = arith.constant 23 : i32
-// CHECK:           %[[VAL_22:.*]] = arith.constant 0.693147182 : f32
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant -5.000000e-01 : f32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1.17549435E-38 : f32
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0xFF800000 : f32
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0x7F800000 : f32
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 0x7FC00000 : f32
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 0.707106769 : f32
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 0.0703768358 : f32
+// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant -0.115146101 : f32
+// CHECK-DAG:       %[[VAL_11:.*]] = arith.constant 0.116769984 : f32
+// CHECK-DAG:       %[[VAL_12:.*]] = arith.constant -0.12420141 : f32
+// CHECK-DAG:       %[[VAL_13:.*]] = arith.constant 0.142493233 : f32
+// CHECK-DAG:       %[[VAL_14:.*]] = arith.constant -0.166680574 : f32
+// CHECK-DAG:       %[[VAL_15:.*]] = arith.constant 0.200007141 : f32
+// CHECK-DAG:       %[[VAL_16:.*]] = arith.constant -0.24999994 : f32
+// CHECK-DAG:       %[[VAL_17:.*]] = arith.constant 0.333333313 : f32
+// CHECK-DAG:       %[[VAL_18:.*]] = arith.constant 1.260000e+02 : f32
+// CHECK-DAG:       %[[VAL_19:.*]] = arith.constant -2139095041 : i32
+// CHECK-DAG:       %[[VAL_20:.*]] = arith.constant 1056964608 : i32
+// CHECK-DAG:       %[[VAL_21:.*]] = arith.constant 23 : i32
+// CHECK-DAG:       %[[VAL_22:.*]] = arith.constant 0.693147182 : f32
 // CHECK:           %[[VAL_23:.*]] = arith.cmpf ugt, %[[X]], %[[VAL_4]] : f32
 // CHECK:           %[[VAL_24:.*]] = arith.select %[[VAL_23]], %[[X]], %[[VAL_4]] : f32
 // CHECK-NOT:       frexp
@@ -417,20 +417,20 @@ func.func @log1p_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
 
 // CHECK-LABEL:   func @tanh_scalar(
 // CHECK-SAME:                      %[[VAL_0:.*]]: f32) -> f32 {
-// CHECK:           %[[VAL_1:.*]] = arith.constant -7.99881172 : f32
-// CHECK:           %[[VAL_2:.*]] = arith.constant 7.99881172 : f32
-// CHECK:           %[[VAL_3:.*]] = arith.constant 4.000000e-04 : f32
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0.00489352457 : f32
-// CHECK:           %[[VAL_5:.*]] = arith.constant 6.37261954E-4 : f32
-// CHECK:           %[[VAL_6:.*]] = arith.constant 1.48572235E-5 : f32
-// CHECK:           %[[VAL_7:.*]] = arith.constant 5.12229725E-8 : f32
-// CHECK:           %[[VAL_8:.*]] = arith.constant -8.60467184E-11 : f32
-// CHECK:           %[[VAL_9:.*]] = arith.constant 2.00018794E-13 : f32
-// CHECK:           %[[VAL_10:.*]] = arith.constant -2.76076837E-16 : f32
-// CHECK:           %[[VAL_11:.*]] = arith.constant 0.00489352504 : f32
-// CHECK:           %[[VAL_12:.*]] = arith.constant 0.00226843474 : f32
-// CHECK:           %[[VAL_13:.*]] = arith.constant 1.18534706E-4 : f32
-// CHECK:           %[[VAL_14:.*]] = arith.constant 1.19825836E-6 : f32
+// CHECK-DAG:       %[[VAL_1:.*]] = arith.constant -7.99881172 : f32
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 7.99881172 : f32
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 4.000000e-04 : f32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0.00489352457 : f32
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 6.37261954E-4 : f32
+// CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1.48572235E-5 : f32
+// CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 5.12229725E-8 : f32
+// CHECK-DAG:       %[[VAL_8:.*]] = arith.constant -8.60467184E-11 : f32
+// CHECK-DAG:       %[[VAL_9:.*]] = arith.constant 2.00018794E-13 : f32
+// CHECK-DAG:       %[[VAL_10:.*]] = arith.constant -2.76076837E-16 : f32
+// CHECK-DAG:       %[[VAL_11:.*]] = arith.constant 0.00489352504 : f32
+// CHECK-DAG:       %[[VAL_12:.*]] = arith.constant 0.00226843474 : f32
+// CHECK-DAG:       %[[VAL_13:.*]] = arith.constant 1.18534706E-4 : f32
+// CHECK-DAG:       %[[VAL_14:.*]] = arith.constant 1.19825836E-6 : f32
 // CHECK:           %[[VAL_15:.*]] = arith.cmpf ult, %[[VAL_0]], %[[VAL_2]] : f32
 // CHECK:           %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_0]], %[[VAL_2]] : f32
 // CHECK:           %[[VAL_17:.*]] = arith.cmpf ugt, %[[VAL_16]], %[[VAL_1]] : f32
diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
index e1e4111adca88c..8290001c458567 100644
--- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
@@ -169,9 +169,9 @@ builtin.module {
   // CHECK-LABEL: @read_2d_with_mask(
   //  CHECK-SAME:     %[[sz0:.*]]: index, %[[sz1:.*]]: index, %[[a:.*]]: memref<1024x1024xf32>
   func.func @read_2d_with_mask(%sz0: index, %sz1: index, %a: memref<1024x1024xf32>) {
-    // CHECK: %[[c0:.*]] = arith.constant 0 : index
-    // CHECK: %[[c1:.*]] = arith.constant 1 : index
-    // CHECK: %[[c2:.*]] = arith.constant 2 : index
+    // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+    // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+    // CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
     %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
     %c0 = arith.constant 0 : index
     %cst_0 = arith.constant 0.000000e+00 : f32
@@ -215,9 +215,9 @@ builtin.module {
   // CHECK-LABEL: @read_3d_with_mask(
   //  CHECK-SAME:     %[[sz0:.*]]: index, %[[sz1:.*]]: index, %[[sz2:.*]]: index, %[[a:.*]]: memref<1024x1024x1024xf32>
   func.func @read_3d_with_mask(%sz0: index, %sz1: index, %sz2: index, %a: memref<1024x1024x1024xf32>) {
-    // CHECK: %[[c0:.*]] = arith.constant 0 : index
-    // CHECK: %[[c1:.*]] = arith.constant 1 : index
-    // CHECK: %[[c2:.*]] = arith.constant 2 : index
+    // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+    // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+    // CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
     %0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
     %c0 = arith.constant 0 : index
     %cst_0 = arith.constant 0.000000e+00 : f32
diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir
index a18c850c3f05f1..33290d2db31d66 100644
--- a/mlir/test/Dialect/SCF/loop-pipelining.mlir
+++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir
@@ -547,9 +547,9 @@ func.func @backedge_same_stage(%A: memref<?xf32>) -> f32 {
 // -----
 
 // CHECK: @pipeline_op_with_region(%[[ARG0:.+]]: memref<?xf32>, %[[ARG1:.+]]: memref<?xf32>, %[[ARG2:.+]]: memref<?xf32>, %[[CF:.*]]: f32) {
-// CHECK:   %[[C0:.+]] = arith.constant 0 :
-// CHECK:   %[[C3:.+]] = arith.constant 3 :
-// CHECK:   %[[C1:.+]] = arith.constant 1 :
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 :
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 :
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 :
 // CHECK:   %[[APRO:.+]] = memref.alloc() :
 // CHECK:   %[[BPRO:.+]] = memref.alloc() :
 // CHECK:   %[[ASV0:.+]] = memref.subview %[[ARG0]][%[[C0]]] [8] [1] :
diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
index 8f96f33794879b..fcc221660353a1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
@@ -45,10 +45,10 @@ func.func @add_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>)
 // CHECK-LABEL:   func @add_d_init(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32xf32, #sparse{{[0-9]*}}>,
 // CHECK-SAME:      %[[VAL_1:.*]]: f32) -> tensor<32xf32> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant 32 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 32 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_INITTENSOR:.*]] = tensor.empty() : tensor<32xf32>
 // CHECK:           %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_INITTENSOR]] : memref<32xf32>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
index aa75e20460f51f..886b21fa975679 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
@@ -58,10 +58,10 @@ func.func @mul_inv_dense1d(%arga: tensor<32xf32, #SpVec>,
 // CHECK-LABEL:   func.func @mul_inv_enc_dense1d(
 // CHECK-SAME:      %[[VAL_0:.*]]: tensor<32xf32, #sparse{{[0-9]*}}>,
 // CHECK-SAME:      %[[VAL_1:.*]]: tensor<4xf32, #sparse{{[0-9]*}}>) -> tensor<32xf32, #sparse{{[0-9]*}}> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant 32 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 3 : index
-// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant 32 : index
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 3 : index
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_6:.*]] = tensor.empty() : tensor<32xf32, #sparse{{[0-9]*}}>
 // CHECK:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK:           %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<4xf32, #sparse{{[0-9]*}}> to memref<?xf32>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
index a323fa85d2fd17..bbf0b7c7c341d1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir
@@ -10,10 +10,10 @@
 //  CHECK-SAME:  %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse>
 //  CHECK-SAME:  %[[TMP_arg1:.*]]: tensor<3x4xf64, #sparse>
 //  CHECK-SAME:  %[[TMP_arg2:.*]]: tensor<4x4xf64, #sparse>
-//       CHECK:  %[[TMP_c0:.*]] = arith.constant 0 : index
-//       CHECK:  %[[TMP_c1:.*]] = arith.constant 1 : index
-//       CHECK:  %[[TMP_c5:.*]] = arith.constant 5 : index
-//       CHECK:  %[[TMP_c2:.*]] = arith.constant 2 : index
+//   CHECK-DAG:  %[[TMP_c0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:  %[[TMP_c1:.*]] = arith.constant 1 : index
+//   CHECK-DAG:  %[[TMP_c5:.*]] = arith.constant 5 : index
+//   CHECK-DAG:  %[[TMP_c2:.*]] = arith.constant 2 : index
 //       CHECK:  %[[TMP_0:.*]] = bufferization.alloc_tensor() : tensor<9x4xf64, #sparse>
 //       CHECK:  %[[TMP_1:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse>
 //       CHECK:  %[[TMP_2:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 0 : index} : tensor<2x4xf64, #sparse>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_storage.mlir b/mlir/test/Dialect/SparseTensor/sparse_storage.mlir
index a8cc056139e49e..cd55ba31a6bce7 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_storage.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_storage.mlir
@@ -23,8 +23,8 @@
 }
 
 // CHECK-LABEL: func @mul64(
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi64>
 // CHECK: %[[B0:.*]] = arith.index_cast %[[P0]] : i64 to index
 // CHECK: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi64>
@@ -49,8 +49,8 @@ func.func @mul64(%arga: tensor<32xf64, #SparseVector64>, %argb: tensor<32xf64>,
 }
 
 // CHECK-LABEL: func @mul32(
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi32>
 // CHECK: %[[Z0:.*]] = arith.extui %[[P0]] : i32 to i64
 // CHECK: %[[B0:.*]] = arith.index_cast %[[Z0]] : i64 to index
diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
index ca4eb4ff679445..e9e3ca9c0087f6 100644
--- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
@@ -261,8 +261,8 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56
 }
 //      CHECK: func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<56x?x?x64xf32>)
-//      CHECK:   %[[c1:.+]] = arith.constant 1 : index
-//      CHECK:   %[[c2:.+]] = arith.constant 2 : index
+//  CHECK-DAG:   %[[c1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[c2:.+]] = arith.constant 2 : index
 //      CHECK:   %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<56x?x?x64xf32>
 //      CHECK:   %[[dim_0:.+]] = tensor.dim %[[ARG0]], %[[c2]] : tensor<56x?x?x64xf32>
 //      CHECK:   %[[INIT:.+]] = tensor.empty(%[[dim_0]], %[[dim]]) : tensor<?x?x56x2x32xf32>
@@ -295,8 +295,8 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0:
 //      CHECK: module {
 //      CHECK:   func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(
 // CHECK-SAME:   %[[ARG0:.+]]: tensor<56x?x?x128xf32>) 
-//      CHECK:     %[[c1:.+]] = arith.constant 1 : index
-//      CHECK:     %[[c2:.+]] = arith.constant 2 : index
+//  CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
 //      CHECK:     %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<56x?x?x128xf32>
 //      CHECK:     %[[dim_0:.+]] = tensor.dim %[[ARG0]], %[[c2]] : tensor<56x?x?x128xf32>
 //      CHECK:     %[[mapped_dim1:.+]] = affine.apply #[[map:.+]]()[%[[dim]]]
@@ -330,10 +330,10 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s
 // CHECK-SAME:   %[[PACK_DEST:.+]]: tensor<?x?x?x?x?x?x?xf32>, %[[TRANSPOSE_DEST:.+]]: tensor<?x?x?x?x?x?x?xf32>,
 // CHECK-SAME:   %[[ARG1:.+]]: index, %[[ARG2:.+]]: index,
 // CHECK-SAME:   %[[ARG3:.+]]: index) 
-//      CHECK:     %[[c0:.+]] = arith.constant 0 : index
-//      CHECK:     %[[c1:.+]] = arith.constant 1 : index
-//      CHECK:     %[[c2:.+]] = arith.constant 2 : index
-//      CHECK:     %[[c3:.+]] = arith.constant 3 : index
+//  CHECK-DAG:     %[[c0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
+//  CHECK-DAG:     %[[c3:.+]] = arith.constant 3 : index
 //      CHECK:     %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c0]] : tensor<?x?x?x?xf32>
 //      CHECK:     %[[dim_0:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<?x?x?x?xf32>
 //      CHECK:     %[[dim_1:.+]] = tensor.dim %[[ARG0]], %[[c2]] : tensor<?x?x?x?xf32>
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 612e99f198515a..27ca3ae3c21be6 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -1080,9 +1080,9 @@ func.func @reduce_sum_constant_aggressive() -> tensor<1x3xi32> {
 
 func.func @reduce_sum_constant_aggressive() -> tensor<2x3xi32> {
   // AGGRESIVE-LABEL:     func.func @reduce_sum_constant_aggressive() -> tensor<2x3xi32> {
-  // AGGRESIVE:           %[[VAL_0:.*]] = "tosa.const"() <{value = dense<2> : tensor<1x2x3xi32>}> : () -> tensor<1x2x3xi32>
-  // AGGRESIVE:           %[[VAL_1:.*]] = "tosa.const"() <{value = dense<1> : tensor<2x2x3xi32>}> : () -> tensor<2x2x3xi32>
-  // AGGRESIVE:           %[[VAL_2:.*]] = "tosa.const"() <{value = dense<2> : tensor<2x3xi32>}> : () -> tensor<2x3xi32>
+  // AGGRESIVE-DAG:       %[[VAL_0:.*]] = "tosa.const"() <{value = dense<2> : tensor<1x2x3xi32>}> : () -> tensor<1x2x3xi32>
+  // AGGRESIVE-DAG:       %[[VAL_1:.*]] = "tosa.const"() <{value = dense<1> : tensor<2x2x3xi32>}> : () -> tensor<2x2x3xi32>
+  // AGGRESIVE-DAG:       %[[VAL_2:.*]] = "tosa.const"() <{value = dense<2> : tensor<2x3xi32>}> : () -> tensor<2x3xi32>
   // AGGRESIVE:           %[[VAL_3:.*]] = tosa.argmax %[[VAL_0]] {axis = 0 : i32} : (tensor<1x2x3xi32>) -> tensor<2x3xi32>
   // AGGRESIVE:           %[[VAL_4:.*]] = tosa.argmax %[[VAL_1]] {axis = 0 : i32} : (tensor<2x2x3xi32>) -> tensor<2x3xi32>
   // AGGRESIVE:           %[[VAL_5:.*]] = tosa.add %[[VAL_3]], %[[VAL_2]] : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32>
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
index c86bf5d056f85e..2224bf3f57b255 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-depthwise.mlir
@@ -26,8 +26,8 @@ func.func @depthwise_conv2d_as_mul(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1
 
 // CHECK-LABEL: @depthwise_conv2d_as_mul_q
 func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<1x1x2x3xi8>, %arg2: tensor<6xi32>) -> tensor<4x10x10x6xi32> {
-  // CHECK: %[[iZp:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>}
-  // CHECK: %[[wZp:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>}
+  // CHECK-DAG: %[[iZp:.+]] = "tosa.const"() <{value = dense<7> : tensor<1x1x1x1x1xi32>}
+  // CHECK-DAG: %[[wZp:.+]] = "tosa.const"() <{value = dense<11> : tensor<1x1x1x1xi32>}
   // CHECK: %[[rIn:.+]] = tosa.reshape %arg0 {new_shape = array<i64: 4, 10, 10, 2, 1>}
   // CHECK: %[[cIn:.+]] = tosa.cast %[[rIn]] : (tensor<4x10x10x2x1xi8>) -> tensor<4x10x10x2x1xi32>
   // CHECK: %[[cWe:.+]] = tosa.cast %arg1 : (tensor<1x1x2x3xi8>) -> tensor<1x1x2x3xi32>
@@ -46,8 +46,8 @@ func.func @depthwise_conv2d_as_mul_q(%arg0: tensor<4x10x10x2xi8>, %arg1: tensor<
 
 // CHECK-LABEL: @depthwise_conv2d_as_mul_padded
 func.func @depthwise_conv2d_as_mul_padded(%arg0: tensor<4x10x10x2xf32>, %arg1: tensor<1x1x2x3xf32>, %arg2: tensor<6xf32>) -> tensor<4x12x12x6xf32> {
-  // CHECK: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0], [0, 0]]> : tensor<5x2xi64>}
-  // CHECK: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
+  // CHECK-DAG: %[[pad:.+]] = "tosa.const"() <{value = dense<{{\[\[}}0, 0], [1, 1], [1, 1], [0, 0], [0, 0]]> : tensor<5x2xi64>}
+  // CHECK-DAG: %[[zero:.+]] = "tosa.const"() <{value = dense<0.000000e+00> : tensor<f32>}
   // CHECK: %[[reIn:.+]] = tosa.reshape %arg0 {new_shape = array<i64: 4, 10, 10, 2, 1>}
   // CHECK: %[[padded:.+]] = tosa.pad %[[reIn]], %[[pad]], %[[zero]] : (tensor<4x10x10x2x1xf32>, tensor<5x2xi64>, tensor<f32>) -> tensor<4x12x12x2x1xf32>
   // CHECK: %[[reArg1:.+]] = tosa.reshape %arg1 {new_shape = array<i64: 1, 1, 1, 2, 3>}
diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
index 5f9fe68c578925..1f2bb3fb9a3657 100644
--- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir
@@ -108,15 +108,12 @@ func.func @transpose_conv2d_strided_quantized(%arg0: tensor<2x17x15x3xi8>, %arg1
 
 // CHECK-LABEL: @transpose_conv2d_strided_overpad
 func.func @transpose_conv2d_strided_overpad(%arg0 : tensor<1x16x1x1xi8>, %arg1 : tensor<1x2x1x1xi8>, %arg2 : tensor<1xi32>) -> (tensor<1x19x2x1xi32>) {
-  // CHECK: %[[WEIGHT_PAD:.+]] = "tosa.const"()
-  // CHECK-SAME{literal}: value = dense<[[0, 0], [0, 0], [0, 1], [0, 0]]> : tensor<4x2xi32>
-  // CHECK: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
-  // CHECK: %[[INPUT_PAD:.+]] = "tosa.const"()
-  // CHECK-SAME{literal}: value = dense<[[0, 0], [1, 1], [0, 0], [0, 0]]> : tensor<4x2xi32>}
-  // CHECK: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>}
-  // CHECK: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
-  // CHECK: %[[RESULT_PAD:.+]] = "tosa.const"()
-  // CHECK-SAME{literal}: value = dense<[[0, 0], [2, 0], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[WEIGHT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [0, 0], [0, 1], [0, 0]]> : tensor<4x2xi32>
+  // CHECK-DAG: %[[WEIGHT_PERMS:.+]] = "tosa.const"() <{value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi32>}
+  // CHECK-DAG: %[[INPUT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [1, 1], [0, 0], [0, 0]]> : tensor<4x2xi32>}
+  // CHECK-DAG: %[[ZERO:.+]] = "tosa.const"() <{value = dense<0> : tensor<2xi32>}
+  // CHECK-DAG: %[[RESULT_PERMS:.+]] = "tosa.const"() <{value = dense<[0, 1, 3, 2, 4, 5]> : tensor<6xi32>}
+  // CHECK-DAG: %[[RESULT_PAD:.+]] = "tosa.const"() <{value = dense<{{\[}}[0, 0], [2, 0], [0, 0], [0, 0]]> : tensor<4x2xi32>}
   // CHECK: %[[PAD_WEIGHT:.+]] = tosa.pad %arg1, %[[WEIGHT_PAD]] {quantization_info = #tosa.pad_quant<input_zp = 93>}
   // CHECK: %[[RESHAPE_WEIGHT_0:.+]] = tosa.reshape %[[PAD_WEIGHT]] {new_shape = array<i64: 1, 2, 1, 1, 2, 1>}
   // CHECK: %[[TRANSPOSE_WEIGHT:.+]] = tosa.transpose %[[RESHAPE_WEIGHT_0]], %[[WEIGHT_PERMS]]
diff --git a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir
index 8774a7513c9a24..4a5ea439134cfd 100644
--- a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir
@@ -54,8 +54,8 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> {
 
 // CHECK-LABEL: func @broadcast_vec3d_from_vec1d
 // CHECK-SAME: %[[A:.*0]]: vector<2xf32>
-// CHECK:      %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32>
-// CHECK:      %[[C1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32>
+// CHECK-DAG:  %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32>
+// CHECK-DAG:  %[[C1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32>
 // CHECK:      %[[T0:.*]] = vector.insert %[[A]], %[[C0]] [0] : vector<2xf32> into vector<3x2xf32>
 // CHECK:      %[[T1:.*]] = vector.insert %[[A]], %[[T0]] [1] : vector<2xf32> into vector<3x2xf32>
 // CHECK:      %[[T2:.*]] = vector.insert %[[A]], %[[T1]] [2] : vector<2xf32> into vector<3x2xf32>
diff --git a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir
index 3c5539d1aea6ed..78cf82e1ab6c1a 100644
--- a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir
@@ -14,9 +14,9 @@
 // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x4xf32>,
 // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x3xf32>,
 // CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: vector<2x3xf32>
-//      CHECK:  %[[vcst:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32>
-//      CHECK:  %[[vcst_0:.*]] = arith.constant dense<0.000000e+00> : vector<12xf32>
-//      CHECK:  %[[vcst_1:.*]] = arith.constant dense<0.000000e+00> : vector<2x3xf32>
+//  CHECK-DAG:  %[[vcst:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32>
+//  CHECK-DAG:  %[[vcst_0:.*]] = arith.constant dense<0.000000e+00> : vector<12xf32>
+//  CHECK-DAG:  %[[vcst_1:.*]] = arith.constant dense<0.000000e+00> : vector<2x3xf32>
 //      CHECK:  %[[a0:.*]] = vector.extract %[[A]][0] : vector<4xf32> from vector<2x4xf32>
 //      CHECK:  %[[a1:.*]] = vector.insert_strided_slice %[[a0]], %[[vcst]] {offsets = [0], strides = [1]} : vector<4xf32> into vector<8xf32>
 //      CHECK:  %[[a2:.*]] = vector.extract %[[A]][1] : vector<4xf32> from vector<2x4xf32>
diff --git a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
index f74af8eda0e6f4..7838543e151bed 100644
--- a/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-mask-lowering-transforms.mlir
@@ -22,9 +22,9 @@ func.func @genbool_2d() -> vector<4x4xi1> {
 }
 
 // CHECK-LABEL: func @genbool_3d
-// CHECK: %[[C1:.*]] = arith.constant dense<[true, true, true, false]> : vector<4xi1>
-// CHECK: %[[C2:.*]] = arith.constant dense<false> : vector<3x4xi1>
-// CHECK: %[[C3:.*]] = arith.constant dense<false> : vector<2x3x4xi1>
+// CHECK-DAG: %[[C1:.*]] = arith.constant dense<[true, true, true, false]> : vector<4xi1>
+// CHECK-DAG: %[[C2:.*]] = arith.constant dense<false> : vector<3x4xi1>
+// CHECK-DAG: %[[C3:.*]] = arith.constant dense<false> : vector<2x3x4xi1>
 // CHECK: %[[T0:.*]] = vector.insert %[[C1]], %[[C2]] [0] : vector<4xi1> into vector<3x4xi1>
 // CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[C3]] [0] : vector<3x4xi1> into vector<2x3x4xi1>
 // CHECK: return %[[T1]] : vector<2x3x4xi1>
@@ -47,10 +47,10 @@ func.func @genbool_var_1d(%arg0: index) -> vector<3xi1> {
 // CHECK-LABEL: func @genbool_var_2d(
 // CHECK-SAME: %[[A:.*0]]: index,
 // CHECK-SAME: %[[B:.*1]]: index)
-// CHECK:      %[[C1:.*]] = arith.constant dense<false> : vector<3xi1>
-// CHECK:      %[[C2:.*]] = arith.constant dense<false> : vector<2x3xi1>
-// CHECK:      %[[c0:.*]] = arith.constant 0 : index
-// CHECK:      %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG:  %[[C1:.*]] = arith.constant dense<false> : vector<3xi1>
+// CHECK-DAG:  %[[C2:.*]] = arith.constant dense<false> : vector<2x3xi1>
+// CHECK-DAG:  %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG:  %[[c1:.*]] = arith.constant 1 : index
 // CHECK:      %[[T0:.*]] = vector.create_mask %[[B]] : vector<3xi1>
 // CHECK:      %[[T1:.*]] = arith.cmpi sgt, %[[A]], %[[c0]] : index
 // CHECK:      %[[T2:.*]] = arith.select %[[T1]], %[[T0]], %[[C1]] : vector<3xi1>
diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
index 17cdda65fb2ca9..6e06ba1bb14b6e 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
@@ -6,9 +6,9 @@ func.func @vector_multi_reduction(%arg0: vector<2x4xf32>, %acc: vector<2xf32>) -
 }
 // CHECK-LABEL: func @vector_multi_reduction
 //  CHECK-SAME:   %[[INPUT:.+]]: vector<2x4xf32>, %[[ACC:.*]]: vector<2xf32>)
-//       CHECK:       %[[RESULT_VEC_0:.+]] = arith.constant dense<{{.*}}> : vector<2xf32>
-//       CHECK:       %[[C0:.+]] = arith.constant 0 : index
-//       CHECK:       %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:       %[[RESULT_VEC_0:.+]] = arith.constant dense<{{.*}}> : vector<2xf32>
+//   CHECK-DAG:       %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:       %[[C1:.+]] = arith.constant 1 : index
 //       CHECK:       %[[V0:.+]] = vector.extract %[[INPUT]][0]
 //       CHECK:       %[[ACC0:.+]] = vector.extract %[[ACC]][0]
 //       CHECK:       %[[RV0:.+]] = vector.reduction <mul>, %[[V0]], %[[ACC0]] : vector<4xf32> into f32
@@ -37,7 +37,7 @@ func.func @vector_reduction_inner(%arg0: vector<2x3x4x5xi32>, %acc: vector<2x3xi
 }
 // CHECK-LABEL: func @vector_reduction_inner
 //  CHECK-SAME:   %[[INPUT:.+]]: vector<2x3x4x5xi32>, %[[ACC:.*]]: vector<2x3xi32>
-//       CHECK:       %[[FLAT_RESULT_VEC_0:.+]] = arith.constant dense<0> : vector<6xi32>
+//   CHECK-DAG:       %[[FLAT_RESULT_VEC_0:.+]] = arith.constant dense<0> : vector<6xi32>
 //   CHECK-DAG:       %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:       %[[C1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:       %[[C2:.+]] = arith.constant 2 : index
@@ -90,15 +90,15 @@ func.func @vector_multi_reduction_ordering(%arg0: vector<3x2x4xf32>, %acc: vecto
 }
 // CHECK-LABEL: func @vector_multi_reduction_ordering
 //  CHECK-SAME:   %[[INPUT:.+]]: vector<3x2x4xf32>, %[[ACC:.*]]: vector<2x4xf32>)
-//       CHECK:       %[[RESULT_VEC_0:.+]] = arith.constant dense<{{.*}}> : vector<8xf32>
-//       CHECK:       %[[C0:.+]] = arith.constant 0 : index
-//       CHECK:       %[[C1:.+]] = arith.constant 1 : index
-//       CHECK:       %[[C2:.+]] = arith.constant 2 : index
-//       CHECK:       %[[C3:.+]] = arith.constant 3 : index
-//       CHECK:       %[[C4:.+]] = arith.constant 4 : index
-//       CHECK:       %[[C5:.+]] = arith.constant 5 : index
-//       CHECK:       %[[C6:.+]] = arith.constant 6 : index
-//       CHECK:       %[[C7:.+]] = arith.constant 7 : index
+//   CHECK-DAG:       %[[RESULT_VEC_0:.+]] = arith.constant dense<{{.*}}> : vector<8xf32>
+//   CHECK-DAG:       %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:       %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:       %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:       %[[C3:.+]] = arith.constant 3 : index
+//   CHECK-DAG:       %[[C4:.+]] = arith.constant 4 : index
+//   CHECK-DAG:       %[[C5:.+]] = arith.constant 5 : index
+//   CHECK-DAG:       %[[C6:.+]] = arith.constant 6 : index
+//   CHECK-DAG:       %[[C7:.+]] = arith.constant 7 : index
 //       CHECK:       %[[TRANSPOSED_INPUT:.+]] = vector.transpose %[[INPUT]], [1, 2, 0] : vector<3x2x4xf32> to vector<2x4x3xf32>
 //       CHECK:       %[[V0:.+]] = vector.extract %[[TRANSPOSED_INPUT]][0, 0]
 //       CHECK:       %[[ACC0:.+]] = vector.extract %[[ACC]][0, 0] : f32 from vector<2x4xf32>
@@ -256,10 +256,10 @@ func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>
 // CHECK-LABEL:   func.func private @scalable_dims(
 // CHECK-SAME:                                     %[[VAL_0:.*]]: vector<8x[4]x2xf32>,
 // CHECK-SAME:                                     %[[VAL_1:.*]]: vector<8x[4]xf32>) -> vector<8x[4]xf32> {
-// CHECK:           %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<[32]xf32>
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_34:.*]] = arith.constant 31 : index
+// CHECK-DAG:       %[[VAL_2:.*]] = arith.constant dense<0.000000e+00> : vector<[32]xf32>
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_34:.*]] = arith.constant 31 : index
 
 // CHECK:           %[[VAL_35:.*]] = vector.extract %[[VAL_0]][0, 0] : vector<2xf32> from vector<8x[4]x2xf32>
 // CHECK:           %[[VAL_36:.*]] = vector.extract %[[VAL_1]][0, 0] : f32 from vector<8x[4]xf32>
diff --git a/mlir/test/Dialect/Vector/vector-scalable-create-mask-lowering.mlir b/mlir/test/Dialect/Vector/vector-scalable-create-mask-lowering.mlir
index 95c77a43155e7e..772388b6490c6e 100644
--- a/mlir/test/Dialect/Vector/vector-scalable-create-mask-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-scalable-create-mask-lowering.mlir
@@ -2,8 +2,8 @@
 
 // CHECK-LABEL: func.func @create_mask_2d_trailing_scalable(
 // CHECK-SAME: %[[arg:.*]]: index) -> vector<3x[4]xi1> {
-// CHECK-NEXT: %[[zero_mask_1d:.*]] = arith.constant dense<false> : vector<[4]xi1>
-// CHECK-NEXT: %[[zero_mask_2d:.*]] = arith.constant dense<false> : vector<3x[4]xi1>
+// CHECK-DAG: %[[zero_mask_1d:.*]] = arith.constant dense<false> : vector<[4]xi1>
+// CHECK-DAG: %[[zero_mask_2d:.*]] = arith.constant dense<false> : vector<3x[4]xi1>
 // CHECK-NEXT: %[[create_mask_1d:.*]] = vector.create_mask %[[arg]] : vector<[4]xi1>
 // CHECK-NEXT: %[[res_0:.*]] = vector.insert %[[create_mask_1d]], %[[zero_mask_2d]] [0] : vector<[4]xi1> into vector<3x[4]xi1>
 // CHECK-NEXT: %[[res_1:.*]] = vector.insert %[[create_mask_1d]], %[[res_0]] [1] : vector<[4]xi1> into vector<3x[4]xi1>