[llvm] [TTI] Add cost model support for [u|s]cmp (PR #106824)

Yingwei Zheng via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 30 20:55:58 PDT 2024


https://github.com/dtcxzyw updated https://github.com/llvm/llvm-project/pull/106824

>From 7aea58f93e311730a7f5270e6fb97bfbbd1b0a90 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 31 Aug 2024 11:20:59 +0800
Subject: [PATCH 1/2] [TTI] Add cost model support for [u|s]cmp

---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  33 +
 .../Analysis/CostModel/X86/icmp-codesize.ll   | 686 +++++++------
 .../Analysis/CostModel/X86/icmp-latency.ll    | 776 ++++++++-------
 .../CostModel/X86/icmp-sizelatency.ll         | 686 +++++++------
 llvm/test/Analysis/CostModel/X86/icmp.ll      | 936 +++++++++---------
 5 files changed, 1669 insertions(+), 1448 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 47323ca067a435..50dc7d5c54c54a 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2196,6 +2196,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::bitreverse:
       ISD = ISD::BITREVERSE;
       break;
+    case Intrinsic::ucmp:
+      ISD = ISD::UCMP;
+      break;
+    case Intrinsic::scmp:
+      ISD = ISD::SCMP;
+      break;
     }
 
     auto *ST = dyn_cast<StructType>(RetTy);
@@ -2433,6 +2439,33 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       }
       return Cost;
     }
+    case Intrinsic::ucmp:
+    case Intrinsic::scmp: {
+      Type *CmpTy = Tys[0];
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      InstructionCost Cost =
+          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
+                                      CmpIntrinsic::getGTPredicate(IID),
+                                      CostKind) +
+          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
+                                      CmpIntrinsic::getLTPredicate(IID),
+                                      CostKind);
+
+      if (TLI->shouldExpandCmpUsingSelects()) {
+        // x < y ? -1 : (x > y ? 1 : 0)
+        Cost += 2 * thisT()->getCmpSelInstrCost(
+                        BinaryOperator::Select, RetTy, CondTy,
+                        ICmpInst::BAD_ICMP_PREDICATE, CostKind);
+      } else {
+        // zext(x > y) - zext(x < y)
+        Cost +=
+            2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
+                                          TTI::CastContextHint::None, CostKind);
+        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
+                                                CostKind);
+      }
+      return Cost;
+    }
     default:
       break;
     }
diff --git a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
index 6c1bfb72c85967..c4687da3632ca7 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
@@ -2367,187 +2367,210 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'scmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'scmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE42-LABEL: 'scmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'scmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'scmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'scmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'scmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'scmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -2579,187 +2602,210 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'ucmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'ucmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE42-LABEL: 'ucmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ucmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ucmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ucmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'ucmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'ucmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
diff --git a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
index efa903ea2819c8..00546243958d78 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
@@ -2597,210 +2597,233 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'scmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; SSE4-LABEL: 'scmp_int'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'scmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'scmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'scmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'scmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'scmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'scmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'scmp_int'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 372 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 744 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -2832,210 +2855,233 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'ucmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
-; SSE4-LABEL: 'ucmp_int'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'ucmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ucmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ucmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ucmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ucmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'ucmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'ucmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'ucmp_int'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 372 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 744 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3065,3 +3111,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
   ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
index 4fc7c68be26f78..ad49e6883cfe1b 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
@@ -2367,187 +2367,210 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'scmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'scmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE42-LABEL: 'scmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'scmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'scmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'scmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'scmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'scmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -2579,187 +2602,210 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'ucmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE41-LABEL: 'ucmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE42-LABEL: 'ucmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ucmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ucmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ucmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'ucmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'ucmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
diff --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll
index d8959a67145d63..2c791a8b2420ea 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp.ll
@@ -3057,256 +3057,279 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'scmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'scmp_int'
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'scmp_int'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE4-LABEL: 'scmp_int'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'scmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'scmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'scmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'scmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'scmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'scmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'scmp_int'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 372 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 744 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3338,256 +3361,279 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
 ; SSE2-LABEL: 'ucmp_int'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE3-LABEL: 'ucmp_int'
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'ucmp_int'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE4-LABEL: 'ucmp_int'
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'ucmp_int'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ucmp_int'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'ucmp_int'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'ucmp_int'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'ucmp_int'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 198 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 204 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; XOPAVX1-LABEL: 'ucmp_int'
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; XOPAVX2-LABEL: 'ucmp_int'
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 196 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 392 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 200 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'ucmp_int'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 186 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 372 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 744 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 180 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I8     = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3617,3 +3663,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
   ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}

>From 0d04e20a655303abf68b519685a8d2abb4784ec5 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sat, 31 Aug 2024 11:38:53 +0800
Subject: [PATCH 2/2] [AArch64][Analysis] Add cost model tests for AArch64

---
 llvm/test/Analysis/CostModel/AArch64/cmp.ll | 52 +++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index 4aed33f42d469e..1b4b5eb616b5a9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -121,3 +121,55 @@ define void @andcmp() {
   %c64sle = icmp sle i64 %a64, 0
   ret void
 }
+
+define void @uscmp() {
+; CHECK-THROUGHPUT-LABEL: 'uscmp'
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u8 = call i8 @llvm.ucmp.i8.i8(i8 undef, i8 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-SIZE-LABEL: 'uscmp'
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u8 = call i8 @llvm.ucmp.i8.i8(i8 undef, i8 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %u8 = call i8 @llvm.ucmp(i8 undef, i8 undef)
+  %u16 = call i16 @llvm.ucmp(i16 undef, i16 undef)
+  %u32 = call i32 @llvm.ucmp(i32 undef, i32 undef)
+  %u64 = call i64 @llvm.ucmp(i64 undef, i64 undef)
+  %uv16i8 = call <16 x i8> @llvm.ucmp(<16 x i8> undef, <16 x i8> undef)
+  %uv8i16 = call <8 x i16> @llvm.ucmp(<8 x i16> undef, <8 x i16> undef)
+  %uv4i32 = call <4 x i32> @llvm.ucmp(<4 x i32> undef, <4 x i32> undef)
+  %s8 = call i8 @llvm.scmp(i8 undef, i8 undef)
+  %s16 = call i16 @llvm.scmp(i16 undef, i16 undef)
+  %s32 = call i32 @llvm.scmp(i32 undef, i32 undef)
+  %s64 = call i64 @llvm.scmp(i64 undef, i64 undef)
+  %sv16i8 = call <16 x i8> @llvm.scmp(<16 x i8> undef, <16 x i8> undef)
+  %sv8i16 = call <8 x i16> @llvm.scmp(<8 x i16> undef, <8 x i16> undef)
+  %sv4i32 = call <4 x i32> @llvm.scmp(<4 x i32> undef, <4 x i32> undef)
+  ret void
+}



More information about the llvm-commits mailing list