[llvm] 47941d6 - [CostModel][X86] Adjust fp<->int vXi32 AVX1+ costs based on llvm-mca reports

Wed Jun 30 07:34:41 PDT 2021

Author: Simon Pilgrim
Date: 2021-06-30T15:23:34+01:00
New Revision: 47941d601debe2d543a5f9452c777072c708ccdd

URL: https://github.com/llvm/llvm-project/commit/47941d601debe2d543a5f9452c777072c708ccdd
DIFF: https://github.com/llvm/llvm-project/commit/47941d601debe2d543a5f9452c777072c708ccdd.diff

LOG: [CostModel][X86] Adjust fp<->int vXi32 AVX1+ costs based on llvm-mca reports

Based off the worse case numbers generated by D103695, the AVX1/2/512 sitofp/uitofp/fptosi/fptoui costs were higher than necessary (based off instruction counts instead of actual throughput).

The SSE costs still need further fixes, but I hit an issue with the order in which SSE costs are checked - we need to check CUSTOM costs (with non-legal types) first, and then fallback to LEGALIZED types. I'm looking at this now, and this should let us start thinning out a lot of the duplicates in the costs tables.

Then we can finally start work on vXi64 / vXi16 / vXi8 / vXi1 integers, which should let us look at sub-128-bit vectorization (D103925).

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/cast.ll
    llvm/test/Analysis/CostModel/X86/fptosi.ll
    llvm/test/Analysis/CostModel/X86/fptoui.ll
    llvm/test/Analysis/CostModel/X86/sitofp.ll
    llvm/test/Analysis/CostModel/X86/uitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 9425027699646..d245324cc9ce6 100644

--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1696,8 +1696,10 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 
     { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f64,  3 },
     { ISD::FP_TO_SINT,  MVT::v8i16,  MVT::v8f64,  3 },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  1 },
     { ISD::FP_TO_SINT,  MVT::v16i8,  MVT::v16f32, 3 },
     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v16i32, MVT::v16f64, 3 },
 
     { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64,  1 },
     { ISD::FP_TO_UINT,  MVT::v8i16,  MVT::v8f64,  3 },
@@ -1822,7 +1824,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i16,  5 },
     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16,  2 },
     { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16,  2 },
-    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
     { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
     { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
@@ -1878,7 +1880,27 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
 
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  5 },
+    { ISD::FP_TO_SINT,  MVT::v4i32,  MVT::v4f64,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f32,  1 },
+    { ISD::FP_TO_SINT,  MVT::v8i32,  MVT::v8f64,  3 },
+
+    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  4 },
+    { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f64,  7 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64,  7 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32,  4 },
+    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f64, 15 },
+
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  3 },
+
+    { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  2 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  4 },
   };
 
   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
@@ -1928,9 +1950,9 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
-    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
-    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
-    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64, MVT::v8i32, 4 },
 
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
@@ -1941,10 +1963,11 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
-    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
-    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 4 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 5 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 8 },
+    { ISD::UINT_TO_FP,  MVT::v8f64, MVT::v8i32, 10 },
     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
     // The generic code to compute the scalar overhead is currently broken.
@@ -1958,13 +1981,21 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 4 },
     { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f64, 3 },
     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f64, 2 },
     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v8i32, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::v8i32, MVT::v8f64, 5 },
 
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f32, 5 },
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 9 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 5 },
     { ISD::FP_TO_UINT,  MVT::v4i8,  MVT::v4f64, 3 },
     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 9 },
     { ISD::FP_TO_UINT,  MVT::v8i8,  MVT::v8f32, 4 },
     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 3 },
     { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 9 },
+    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f64, 19 },
     // This node is expanded into scalarized operations but BasicTTI is overly
     // optimistic estimating its cost.  It computes 3 per element (one
     // vector-extract, one scalar conversion and one vector-insert).  The
@@ -2030,6 +2061,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
     { ISD::TRUNCATE,    MVT::v2i8,   MVT::v2i64,  1 }, // PSHUFB
 
+    { ISD::SINT_TO_FP,  MVT::v2f32,  MVT::v2i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v2i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v2f64,  MVT::v4i32,  1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32,  2 },
+
     { ISD::UINT_TO_FP,  MVT::f32,    MVT::i64,    4 },
     { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
 

diff  --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 305ca54aa4b11..60fd218a19f6c 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -372,27 +372,49 @@ define i32 @masks4(<4 x i1> %in) {
 }
 
 define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
-; SSE-LABEL: 'sitofp4'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'sitofp4'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE41-LABEL: 'sitofp4'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'sitofp4'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX-LABEL: 'sitofp4'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-LABEL: 'sitofp4'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A2 = sitofp <4 x i1> %a to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B1 = sitofp <4 x i8> %b to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %B2 = sitofp <4 x i8> %b to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C1 = sitofp <4 x i16> %c to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %C2 = sitofp <4 x i16> %c to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <4 x i32> %d to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D2 = sitofp <4 x i32> %d to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'sitofp4'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A1 = sitofp <4 x i1> %a to <4 x float>
@@ -417,19 +439,33 @@ define void @sitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
 }
 
 define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
-; SSE-LABEL: 'sitofp8'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SSE2-LABEL: 'sitofp8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE41-LABEL: 'sitofp8'
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; SSE41-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'sitofp8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX-LABEL: 'sitofp8'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX2-LABEL: 'sitofp8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'sitofp8'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float>
@@ -457,16 +493,27 @@ define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-; AVX-LABEL: 'uitofp4'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; AVX1-LABEL: 'uitofp4'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'uitofp4'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'uitofp4'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
@@ -509,7 +556,7 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'uitofp8'

diff  --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll
index 3ada6d104b612..fd84c5eafdae3 100644
--- a/llvm/test/Analysis/CostModel/X86/fptosi.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 ;
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
 
 define i32 @fptosi_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptosi_double_i64'
@@ -68,12 +68,19 @@ define i32 @fptosi_double_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptosi_double_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptosi_double_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptosi_double_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptosi_double_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32
@@ -227,13 +234,21 @@ define i32 @fptosi_float_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptosi_float_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x float> undef to <2 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptosi_float_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x float> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptosi_float_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = fptosi <2 x float> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptosi_float_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32

diff  --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index 85de13eda3648..390aeaaef4fd6 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 ;
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
 
 define i32 @fptoui_double_i64(i32 %arg) {
 ; SSE2-LABEL: 'fptoui_double_i64'
@@ -75,12 +75,19 @@ define i32 @fptoui_double_i32(i32 %arg) {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptoui_double_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptoui_double_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptoui_double_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_double_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32
@@ -241,13 +248,21 @@ define i32 @fptoui_float_i32(i32 %arg) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'fptoui_float_i32'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'fptoui_float_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'fptoui_float_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_float_i32'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32

diff  --git a/llvm/test/Analysis/CostModel/X86/sitofp.ll b/llvm/test/Analysis/CostModel/X86/sitofp.ll
index a6e34a80c47e9..718467c1e7832 100644
--- a/llvm/test/Analysis/CostModel/X86/sitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/sitofp.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
 
 define i32 @sitofp_i8_double() {
 ; SSE-LABEL: 'sitofp_i8_double'
@@ -69,23 +69,37 @@ define i32 @sitofp_i16_double() {
 }
 
 define i32 @sitofp_i32_double() {
-; SSE-LABEL: 'sitofp_i32_double'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'sitofp_i32_double'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'sitofp_i32_double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-LABEL: 'sitofp_i32_double'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sitofp_i32_double'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sitofp_i32_double'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'sitofp_i32_double'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = sitofp i32 undef to double
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -200,25 +214,41 @@ define i32 @sitofp_i16_float() {
 }
 
 define i32 @sitofp_i32_float() {
-; SSE-LABEL: 'sitofp_i32_float'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'sitofp_i32_float'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'sitofp_i32_float'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE42-LABEL: 'sitofp_i32_float'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sitofp_i32_float'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sitofp_i32_float'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'sitofp_i32_float'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = sitofp i32 undef to float
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>

diff  --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll
index 42d1e59753dfb..deb6bd496e13e 100644
--- a/llvm/test/Analysis/CostModel/X86/uitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll
@@ -76,12 +76,19 @@ define i32 @uitofp_i32_double() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
-; AVX-LABEL: 'uitofp_i32_double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; AVX1-LABEL: 'uitofp_i32_double'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'uitofp_i32_double'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'uitofp_i32_double'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
@@ -218,22 +225,22 @@ define i32 @uitofp_i32_float() {
 ; AVX1-LABEL: 'uitofp_i32_float'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'uitofp_i32_float'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'uitofp_i32_float'
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index 9544a4a820c5b..5b5f5b051bae4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,XOP
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX256DQ
 
@@ -144,32 +144,44 @@ define void @fptoui_8f64_8i32() #0 {
 ; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256NODQ-LABEL: @fptoui_8f64_8i32(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
-; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    ret void
+; AVX1-LABEL: @fptoui_8f64_8i32(
+; AVX1-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX1-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX1-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX1-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX1-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX1-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX1-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX1-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i32
+; AVX1-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i32
+; AVX1-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i32
+; AVX1-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i32
+; AVX1-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i32
+; AVX1-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i32
+; AVX1-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i32
+; AVX1-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i32
+; AVX1-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX1-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX1-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX1-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; AVX1-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; AVX1-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; AVX1-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX1-NEXT:    ret void
+;
+; XOP-LABEL: @fptoui_8f64_8i32(
+; XOP-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; XOP-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; XOP-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; XOP-NEXT:    ret void
+;
+; AVX2-LABEL: @fptoui_8f64_8i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX2-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX2-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @fptoui_8f64_8i32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
index 7b4677cc7e19b..4b4b01ba8ce15 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -216,14 +216,20 @@ define void @sitofp_8i64_8f64() #0 {
 }
 
 define void @sitofp_2i32_2f64() #0 {
-; CHECK-LABEL: @sitofp_2i32_2f64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
-; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_2i32_2f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_2i32_2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
index f187f6385f57a..ef63088afbb7a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -216,14 +216,20 @@ define void @sitofp_8i64_8f64() #0 {
 }
 
 define void @sitofp_2i32_2f64() #0 {
-; CHECK-LABEL: @sitofp_2i32_2f64(
-; CHECK-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
-; CHECK-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; CHECK-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_2i32_2f64(
+; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @sitofp_2i32_2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
   %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
index e07743d27834d..dfc7d64103390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256NODQ,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256,AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -135,14 +135,20 @@ define void @uitofp_2i32_2f64() #0 {
 ; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
 ; SSE-NEXT:    ret void
 ;
-; AVX256NODQ-LABEL: @uitofp_2i32_2f64(
-; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
-; AVX256NODQ-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256NODQ-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256NODQ-NEXT:    ret void
+; AVX1-LABEL: @uitofp_2i32_2f64(
+; AVX1-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX1-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX1-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX1-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @uitofp_2i32_2f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX2-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX2-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_2i32_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64