[llvm] 941feb7 - [CostModel][X86] Fix SSE41/SSE42 cost checks on icmp tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 30 09:19:10 PDT 2024


Author: Simon Pilgrim
Date: 2024-08-30T17:18:59+01:00
New Revision: 941feb76c8186d2e237690511b48f57c6bda282b

URL: https://github.com/llvm/llvm-project/commit/941feb76c8186d2e237690511b48f57c6bda282b
DIFF: https://github.com/llvm/llvm-project/commit/941feb76c8186d2e237690511b48f57c6bda282b.diff

LOG: [CostModel][X86] Fix SSE41/SSE42 cost checks on icmp tests

Noticed on #106747 - some SSE41 tests didn't match the SSE2 baseline so we were missing ALL the checks :(

Added: 
    

Modified: 
    llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
    llvm/test/Analysis/CostModel/X86/icmp-latency.ll
    llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
    llvm/test/Analysis/CostModel/X86/icmp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
index 2dc6737a3d8a07..6c1bfb72c85967 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-codesize.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -11,8 +11,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
 ;
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_eq'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_eq'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
@@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ne'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ne'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
@@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
@@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_uge'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_uge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_uge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
@@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sgt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sgt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
@@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ugt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ugt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
@@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sle'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sle'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ule'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_slt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ult'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 }
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
   ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}

diff  --git a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
index 726a6dd782a4f7..efa903ea2819c8 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-latency.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -12,7 +12,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
 ;
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_eq'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_eq'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
@@ -274,6 +297,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ne'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ne'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
@@ -509,6 +555,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
@@ -721,6 +790,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_uge'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_uge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_uge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
@@ -956,6 +1071,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sgt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sgt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
@@ -1191,6 +1329,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ugt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ugt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
@@ -1426,6 +1587,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sle'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sle'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1638,6 +1822,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ule'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1873,6 +2103,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_slt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -2108,6 +2361,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ult'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2320,28 +2596,51 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
-; SSE42-LABEL: 'scmp_int'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'scmp_int'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -2532,28 +2831,51 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 }
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
-; SSE42-LABEL: 'ucmp_int'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'ucmp_int'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)

diff  --git a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
index 0deaad5991fb2f..4fc7c68be26f78 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp-sizelatency.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse3 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -11,8 +11,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
 ;
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=SSE4,SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -39,6 +39,29 @@ define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_eq'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp eq <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp eq <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp eq <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp eq <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_eq'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> %argv16i8, %argv16i8
@@ -251,6 +274,29 @@ define i32 @cmp_int_ne(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ne'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp ne <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp ne <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp ne <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp ne <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ne'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> %argv16i8, %argv16i8
@@ -463,6 +509,29 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> %argv16i8, %argv16i8
@@ -652,6 +721,52 @@ define i32 @cmp_int_sge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_uge(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_uge'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_uge'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp uge <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp uge <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp uge <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp uge <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_uge'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> %argv16i8, %argv16i8
@@ -864,6 +979,29 @@ define i32 @cmp_int_sgt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sgt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp sgt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp sgt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp sgt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp sgt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sgt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> %argv16i8, %argv16i8
@@ -1076,6 +1214,29 @@ define i32 @cmp_int_ugt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ugt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ugt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ugt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ugt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ugt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ugt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> %argv16i8, %argv16i8
@@ -1288,6 +1449,29 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_sle'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = icmp sle <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = icmp sle <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = icmp sle <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = icmp sle <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_sle'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> %argv16i8, %argv16i8
@@ -1477,6 +1661,52 @@ define i32 @cmp_int_sle(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @cmp_int_ule(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'cmp_int_ule'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'cmp_int_ule'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = icmp ule <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = icmp ule <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = icmp ule <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = icmp ule <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ule'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> %argv16i8, %argv16i8
@@ -1689,6 +1919,29 @@ define i32 @cmp_int_slt(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_slt'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp slt <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp slt <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp slt <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp slt <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_slt'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> %argv16i8, %argv16i8
@@ -1901,6 +2154,29 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSE41-LABEL: 'cmp_int_ult'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> %argv32i8, %argv32i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> %argv64i8, %argv64i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> %argv128i8, %argv128i8
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 %arg16, %arg16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> %argv8i16, %argv8i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> %argv16i16, %argv16i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> %argv32i16, %argv32i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> %argv64i16, %argv64i16
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 %arg32, %arg32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> %argv4i32, %argv4i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> %argv8i32, %argv8i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> %argv16i32, %argv16i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> %argv32i32, %argv32i32
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 %arg64, %arg64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = icmp ult <2 x i64> %argv2i64, %argv2i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = icmp ult <4 x i64> %argv4i64, %argv4i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = icmp ult <8 x i64> %argv8i64, %argv8i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I64 = icmp ult <16 x i64> %argv16i64, %argv16i64
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'cmp_int_ult'
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 %arg8, %arg8
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> %argv16i8, %argv16i8
@@ -2090,6 +2366,52 @@ define i32 @cmp_int_ult(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64
 }
 
 define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'scmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'scmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2256,6 +2578,52 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 }
 
 define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
+; SSE2-LABEL: 'ucmp_int'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 312 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 624 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
+; SSE41-LABEL: 'ucmp_int'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
@@ -2421,3 +2789,5 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
   ret i32 undef
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE4: {{.*}}

diff  --git a/llvm/test/Analysis/CostModel/X86/icmp.ll b/llvm/test/Analysis/CostModel/X86/icmp.ll
index 599895c2b5705a..d8959a67145d63 100644
--- a/llvm/test/Analysis/CostModel/X86/icmp.ll
+++ b/llvm/test/Analysis/CostModel/X86/icmp.ll
@@ -2,8 +2,8 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4,SSE41
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX1
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
@@ -12,7 +12,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=XOPAVX2
 ;
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE4,SSE42
 ; RUN: opt < %s -mtriple=x86_64-- -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
 
 define i32 @cmp_int_eq(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i8> %argv64i8, <128 x i8> %argv128i8, i16 %arg16, <8 x i16> %argv8i16, <16 x i16> %argv16i16, <32 x i16> %argv32i16, <64 x i16> %argv64i16, i32 %arg32, <4 x i32> %argv4i32, <8 x i32> %argv8i32, <16 x i32> %argv16i32, <32 x i32> %argv32i32, i64 %arg64, <2 x i64> %argv2i64, <4 x i64> %argv4i64, <8 x i64> %argv8i64, <16 x i64> %argv16i64) {
@@ -3125,51 +3125,28 @@ define i32 @scmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'scmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'scmp_int'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-LABEL: 'scmp_int'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.scmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.scmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.scmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.scmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.scmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.scmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.scmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.scmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.scmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.scmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.scmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.scmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.scmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.scmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.scmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.scmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'scmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.scmp.i8.i8(i8 %arg8, i8 %arg8)
@@ -3429,51 +3406,28 @@ define i32 @ucmp_int(i8 %arg8, <16 x i8> %argv16i8, <32 x i8> %argv32i8, <64 x i
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; SSE41-LABEL: 'ucmp_int'
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'ucmp_int'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE4-LABEL: 'ucmp_int'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> %argv16i8, <16 x i8> %argv16i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.ucmp.v32i8.v32i8(<32 x i8> %argv32i8, <32 x i8> %argv32i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.ucmp.v64i8.v64i8(<64 x i8> %argv64i8, <64 x i8> %argv64i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V128I8 = call <128 x i8> @llvm.ucmp.v128i8.v128i8(<128 x i8> %argv128i8, <128 x i8> %argv128i8)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.ucmp.i16.i16(i16 %arg16, i16 %arg16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> %argv8i16, <8 x i16> %argv8i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.ucmp.v16i16.v16i16(<16 x i16> %argv16i16, <16 x i16> %argv16i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ucmp.v32i16.v32i16(<32 x i16> %argv32i16, <32 x i16> %argv32i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V64I16 = call <64 x i16> @llvm.ucmp.v64i16.v64i16(<64 x i16> %argv64i16, <64 x i16> %argv64i16)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ucmp.i32.i32(i32 %arg32, i32 %arg32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %argv4i32, <4 x i32> %argv4i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.ucmp.v8i32.v8i32(<8 x i32> %argv8i32, <8 x i32> %argv8i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.ucmp.v16i32.v16i32(<16 x i32> %argv16i32, <16 x i32> %argv16i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I32 = call <32 x i32> @llvm.ucmp.v32i32.v32i32(<32 x i32> %argv32i32, <32 x i32> %argv32i32)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.ucmp.i64.i64(i64 %arg64, i64 %arg64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.ucmp.v2i64.v2i64(<2 x i64> %argv2i64, <2 x i64> %argv2i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.ucmp.v4i64.v4i64(<4 x i64> %argv4i64, <4 x i64> %argv4i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.ucmp.v8i64.v8i64(<8 x i64> %argv8i64, <8 x i64> %argv8i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I64 = call <16 x i64> @llvm.ucmp.v16i64.v16i64(<16 x i64> %argv16i64, <16 x i64> %argv16i64)
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'ucmp_int'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.ucmp.i8.i8(i8 %arg8, i8 %arg8)


        


More information about the llvm-commits mailing list