[llvm] 32dd587 - Temporarily Revert "[CostModel] remove cost-kind predicate for intrinsics in basic TTI implementation"

Thu Nov 19 22:38:10 PST 2020

Author: Eric Christopher
Date: 2020-11-19T22:10:23-08:00
New Revision: 32dd5870ee31af8ff895e6e2522f46182901d1b6

URL: https://github.com/llvm/llvm-project/commit/32dd5870ee31af8ff895e6e2522f46182901d1b6
DIFF: https://github.com/llvm/llvm-project/commit/32dd5870ee31af8ff895e6e2522f46182901d1b6.diff

LOG: Temporarily Revert "[CostModel] remove cost-kind predicate for intrinsics in basic TTI implementation"
as it's causing crashes in the optimizer. A reduced testcase has been posted as a follow-up.

This reverts commit f7eac51b9b3f780c96ca41913293851c5acb465b.

Temporarily Revert "[CostModel] make default size cost for libcalls small (again)" as it depends upon the primary revert.

This reverts commit 8ec7ea3ddce7379e13e8dfb4a5260a6d2004aa1c.

Temporarily Revert "[CostModel] add tests for math library calls; NFC" as it depends upon the primary revert.

This reverts commit df09f825995b10da03f148133c119f52c94fd6e4.

Temporarily Revert "[LoopUnroll] add test for full unroll that is sensitive to cost-model; NFC" as it depends upon the primary revert.

This reverts commit 618d555e8d926a83161774df2035519c387269db.

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
    llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
    llvm/test/Analysis/CostModel/ARM/arith-usat.ll
    llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
    llvm/test/Analysis/CostModel/X86/fmaxnum-size-latency.ll
    llvm/test/Analysis/CostModel/X86/fminnum-size-latency.ll
    llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
    llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll

Removed: 
    llvm/test/Transforms/LoopUnroll/AArch64/thresholdO3-cost-model.ll


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0b3d7704028a..0bea354345d5 100644

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1166,6 +1166,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     FastMathFlags FMF = ICA.getFlags();
     switch (IID) {
     default:
+      // FIXME: all cost kinds should default to the same thing?
+      if (CostKind != TTI::TCK_RecipThroughput)
+        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       break;
 
     case Intrinsic::cttz:
@@ -1315,9 +1318,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
     }
 
-    // Library call cost - other than size, make it expensive.
-    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
     SmallVector<unsigned, 2> ISDs;
+    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
     default: {
       // Assume that we need to scalarize this intrinsic.

diff  --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
index 25b268b9b244..050f2a790533 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll
@@ -85,60 +85,60 @@ define i32 @sadd(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'sadd'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'sadd'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'sadd'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
@@ -243,60 +243,60 @@ define i32 @uadd(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'uadd'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'uadd'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'uadd'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
@@ -401,60 +401,60 @@ define i32 @ssub(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'ssub'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'ssub'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'ssub'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
@@ -559,60 +559,60 @@ define i32 @usub(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'usub'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'usub'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'usub'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
@@ -717,60 +717,60 @@ define i32 @smul(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'smul'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'smul'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'smul'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 325 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 557 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -875,60 +875,60 @@ define i32 @umul(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'umul'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'umul'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'umul'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 181 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 553 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)

diff  --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
index 66c99d804b26..d5afce84b136 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
@@ -85,60 +85,60 @@ define i32 @add(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'add'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'add'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'add'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
@@ -243,60 +243,60 @@ define i32 @sub(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'sub'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'sub'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'sub'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 153 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)

diff  --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
index 036cb607753d..1059c2ee551c 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
@@ -85,60 +85,60 @@ define i32 @add(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'add'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'add'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'add'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
@@ -243,60 +243,60 @@ define i32 @sub(i32 %arg) {
 ; MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'sub'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'sub'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; MVE-SIZE-LABEL: 'sub'
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)

diff  --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
index 66bb10e8474f..4e18acad1617 100644
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
@@ -16,9 +16,6 @@ declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
 declare float @llvm.fmuladd.f32(float, float, float)
 declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
 
-declare float @llvm.log2.f32(float)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-
 declare i32 @llvm.cttz.i32(i32, i1)
 declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
 
@@ -46,13 +43,13 @@ define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'smax'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'smax'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
@@ -73,12 +70,12 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
 ;
 ; SIZE-LABEL: 'fmuladd'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'fmuladd'
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
@@ -86,32 +83,6 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
   ret void
 }
 
-define void @log2(float %a, <16 x float> %va) {
-; THRU-LABEL: 'log2'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'log2'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'log2'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'log2'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-  %s = call float @llvm.log2.f32(float %a)
-  %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-  ret void
-}
-
 define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'cttz'
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)

diff  --git a/llvm/test/Analysis/CostModel/X86/fmaxnum-size-latency.ll b/llvm/test/Analysis/CostModel/X86/fmaxnum-size-latency.ll
index 4b22b74f7a56..25aafc0bdfcf 100644
--- a/llvm/test/Analysis/CostModel/X86/fmaxnum-size-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fmaxnum-size-latency.ll
@@ -1,23 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s
 
 define i32 @f32(i32 %arg) {
-; SSE2-LABEL: 'f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'f32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call float @llvm.maxnum.f32(float undef, float undef)
   %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
@@ -28,21 +20,13 @@ define i32 @f32(i32 %arg) {
 }
 
 define i32 @f64(i32 %arg) {
-; SSE2-LABEL: 'f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'f64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %f64 = call double @llvm.maxnum.f64(double undef, double undef)
   %V2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)

diff  --git a/llvm/test/Analysis/CostModel/X86/fminnum-size-latency.ll b/llvm/test/Analysis/CostModel/X86/fminnum-size-latency.ll
index 666e7d33f34e..b81b4d817758 100644
--- a/llvm/test/Analysis/CostModel/X86/fminnum-size-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fminnum-size-latency.ll
@@ -1,23 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -cost-kind=size-latency -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s
 
 define i32 @f32(i32 %arg) {
-; SSE2-LABEL: 'f32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'f32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'f32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call float @llvm.minnum.f32(float undef, float undef)
   %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
@@ -28,21 +20,13 @@ define i32 @f32(i32 %arg) {
 }
 
 define i32 @f64(i32 %arg) {
-; SSE2-LABEL: 'f64'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX2-LABEL: 'f64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-LABEL: 'f64'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %f64 = call double @llvm.minnum.f64(double undef, double undef)
   %V2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)

diff  --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 9cd8614764dc..bdc1a8be673c 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -19,9 +19,6 @@ declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
 declare float @llvm.fmuladd.f32(float, float, float)
 declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>)
 
-declare float @llvm.log2.f32(float)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-
 declare i32 @llvm.cttz.i32(i32, i1)
 declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
 
@@ -51,13 +48,13 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'umul'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'umul'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
@@ -77,13 +74,13 @@ define void @smax(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'smax'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'smax'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %va, <16 x i32> %vb)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call i32 @llvm.smax.i32(i32 %a, i32 %b)
@@ -103,13 +100,13 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmuladd'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'fmuladd'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %va, <16 x float> %vb, <16 x float> %vc)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %s = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
@@ -117,32 +114,6 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float
   ret void
 }
 
-define void @log2(float %a, <16 x float> %va) {
-; THRU-LABEL: 'log2'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; LATE-LABEL: 'log2'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call float @llvm.log2.f32(float %a)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE-LABEL: 'log2'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SIZE_LATE-LABEL: 'log2'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-  %s = call float @llvm.log2.f32(float %a)
-  %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va)
-  ret void
-}
-
 define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-LABEL: 'cttz'
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)

diff  --git a/llvm/test/Transforms/LoopUnroll/AArch64/thresholdO3-cost-model.ll b/llvm/test/Transforms/LoopUnroll/AArch64/thresholdO3-cost-model.ll
deleted file mode 100644
index 668bc69e5a0c..000000000000
--- a/llvm/test/Transforms/LoopUnroll/AArch64/thresholdO3-cost-model.ll
+++ /dev/null
@@ -1,96 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-unroll -unroll-threshold=300 -S %s | FileCheck %s
-
-; This test was fully unrolled and simplified at -O3 with clang 11.
-; Changes to the cost model may cause that decision to 
diff er.
-; We would not necessarily view the 
diff erence as a regression,
-; but we should be aware that cost model changes can affect an
-; example like this drastically.
-
-target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-w64-windows-gnu"
-
- at tab_log2 = internal unnamed_addr constant [33 x i16] [i16 4, i16 1459, i16 2870, i16 4240, i16 5572, i16 6867, i16 8127, i16 9355, i16 10552, i16 11719, i16 12858, i16 13971, i16 15057, i16 16120, i16 17158, i16 18175, i16 19170, i16 20145, i16 21100, i16 22036, i16 22954, i16 23854, i16 24738, i16 25605, i16 26457, i16 27294, i16 28116, i16 28924, i16 29719, i16 30500, i16 31269, i16 32025, i16 -32767], align 2
-
-declare i32 @llvm.ctlz.i32(i32, i1 immarg)
-declare double @llvm.log2.f64(double)
-
-define i32 @tripcount_11() {
-; CHECK-LABEL: @tripcount_11(
-; CHECK-NEXT:  do.body6.preheader:
-; CHECK-NEXT:    br label [[DO_BODY6:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_1:%.*]], label [[IF_THEN11:%.*]]
-; CHECK:       do.body6:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND:%.*]], label [[IF_THEN11]]
-; CHECK:       if.then11:
-; CHECK-NEXT:    unreachable
-; CHECK:       for.cond.1:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_2:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.2:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_3:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.3:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_4:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.4:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_5:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.5:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_6:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.6:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_7:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.7:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_8:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.8:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_9:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.9:
-; CHECK-NEXT:    br i1 true, label [[FOR_COND_10:%.*]], label [[IF_THEN11]]
-; CHECK:       for.cond.10:
-; CHECK-NEXT:    ret i32 0
-;
-do.body6.preheader:
-  br label %do.body6
-
-for.cond:
-  %cmp5.not = icmp eq i32 %div20, 0
-  br i1 %cmp5.not, label %for.end, label %do.body6
-
-do.body6:
-  %i.021 = phi i32 [ %div20, %for.cond ], [ 1024, %do.body6.preheader ]
-  %or.i = or i32 %i.021, 1
-  %t0 = tail call i32 @llvm.ctlz.i32(i32 %or.i, i1 true)
-  %shl.i = shl i32 %i.021, %t0
-  %and.i = lshr i32 %shl.i, 26
-  %t1 = trunc i32 %and.i to i8
-  %conv3.i = and i8 %t1, 31
-  %and4.i = lshr i32 %shl.i, 11
-  %conv6.i = and i32 %and4.i, 32767
-  %idxprom.i = zext i8 %conv3.i to i64
-  %arrayidx.i7 = getelementptr inbounds [33 x i16], [33 x i16]* @tab_log2, i64 0, i64 %idxprom.i
-  %t2 = load i16, i16* %arrayidx.i7, align 2
-  %conv7.i = zext i16 %t2 to i32
-  %narrow.i = add nuw nsw i8 %conv3.i, 1
-  %t3 = zext i8 %narrow.i to i64
-  %arrayidx11.i = getelementptr inbounds [33 x i16], [33 x i16]* @tab_log2, i64 0, i64 %t3
-  %t4 = load i16, i16* %arrayidx11.i, align 2
-  %conv12.i = zext i16 %t4 to i32
-  %sub16.i = sub nsw i32 %conv12.i, %conv7.i
-  %mul.i8 = mul nsw i32 %conv6.i, %sub16.i
-  %shr17.i = ashr i32 %mul.i8, 15
-  %conv.i = shl nuw nsw i32 %t0, 15
-  %shl20.i = xor i32 %conv.i, 1015808
-  %add18.i = add nuw nsw i32 %shl20.i, %conv7.i
-  %add21.i = add nsw i32 %add18.i, %shr17.i
-  %conv = sitofp i32 %i.021 to double
-  %t5 = tail call double @llvm.log2.f64(double %conv)
-  %conv8 = fptosi double %t5 to i32
-  %mul = shl nsw i32 %conv8, 15
-  %add = or i32 %mul, 4
-  %cmp9 = icmp eq i32 %add21.i, %add
-  %div20 = lshr i32 %i.021, 1
-  br i1 %cmp9, label %for.cond, label %if.then11
-
-if.then11:
-  unreachable
-
-for.end:
-  ret i32 0
-}

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index f2dd89ea7d9f..b55babaea3df 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,7 +95,7 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:  entry:
 ; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FVW2:       vector.body:
-; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT:    [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
 ; FVW2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
 ; FVW2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
 ; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
@@ -125,9 +125,39 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
 ; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
 ; FVW2-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
 ; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
-; FVW2-NEXT:    [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 4
-; FVW2-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 4096
-; FVW2-NEXT:    br i1 [[TMP20]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; FVW2-NEXT:    [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
+; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; FVW2-NEXT:    [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
+; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
+; FVW2-NEXT:    [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
+; FVW2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
+; FVW2-NEXT:    [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; FVW2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
+; FVW2-NEXT:    [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT:    [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
+; FVW2-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
+; FVW2-NEXT:    [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
+; FVW2-NEXT:    [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT:    [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
+; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
+; FVW2-NEXT:    [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
+; FVW2-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; FVW2-NEXT:    br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; FVW2:       for.end:
 ; FVW2-NEXT:    ret void
 ;