[llvm] eb6429d - [CostModel][X86] Add uitpfp v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs

Fri May 21 03:31:30 PDT 2021

Author: Simon Pilgrim
Date: 2021-05-21T11:30:15+01:00
New Revision: eb6429d0fb94fd467e03d229177ae6ff3a44e3cc

URL: https://github.com/llvm/llvm-project/commit/eb6429d0fb94fd467e03d229177ae6ff3a44e3cc
DIFF: https://github.com/llvm/llvm-project/commit/eb6429d0fb94fd467e03d229177ae6ff3a44e3cc.diff

LOG: [CostModel][X86] Add uitpfp v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs

These were using (default) scalarized values.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/fptoui.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 4b8d245b3756c..af7b4202f5a2f 100644

--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1877,12 +1877,12 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f64, 2 },
     { ISD::FP_TO_UINT,  MVT::v8i8,  MVT::v8f32, 4 },
     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 3 },
+    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 9 },
     // This node is expanded into scalarized operations but BasicTTI is overly
     // optimistic estimating its cost.  It computes 3 per element (one
     // vector-extract, one scalar conversion and one vector-insert).  The
     // problem is that the inserts form a read-modify-write chain so latency
     // should be factored in too.  Inflating the cost per element by 1.
-    { ISD::FP_TO_UINT,  MVT::v8i32, MVT::v8f32, 8*4 },
     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f64, 4*4 },
 
     { ISD::FP_EXTEND,   MVT::v4f64,  MVT::v4f32,  1 },
@@ -1985,6 +1985,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_UINT,  MVT::v2i16,  MVT::v2f32,  2 },
     { ISD::FP_TO_UINT,  MVT::v2i16,  MVT::v2f64,  2 },
     { ISD::FP_TO_UINT,  MVT::v4i16,  MVT::v4f32,  4 },
+    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  8 },
 
     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },

diff  --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll
index a3ec315c4eb46..85de13eda3648 100644
--- a/llvm/test/Analysis/CostModel/X86/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll
@@ -233,28 +233,20 @@ define i32 @fptoui_float_i64(i32 %arg) {
 }
 
 define i32 @fptoui_float_i32(i32 %arg) {
-; SSE2-LABEL: 'fptoui_float_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SSE42-LABEL: 'fptoui_float_i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'fptoui_float_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'fptoui_float_i32'
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'fptoui_float_i32'
@@ -264,14 +256,6 @@ define i32 @fptoui_float_i32(i32 %arg) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
-;
-; SLM-LABEL: 'fptoui_float_i32'
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = fptoui <2 x float> undef to <2 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I32 = fptoui float undef to i32
   %V2I32 = fptoui <2 x float> undef to <2 x i32>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 6233845e2a6ec..99f16a07e5626 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 }
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x i32> [[R72]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 1c19a9a89467a..759744055bca9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -61,55 +61,11 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 }
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
-; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x i32> [[R7]]
-;
-; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[R7]]
-;
-; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    ret <8 x i32> [[R72]]
-;
-; AVX512-LABEL: @fptosi_fptoui(
-; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
-; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @fptosi_fptoui(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index c8cb290382e33..9544a4a820c5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX256NODQ
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256DQ
+; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX256DQ
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -74,11 +74,11 @@ define void @fptoui_8f64_8i64() #0 {
 ; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
-; AVX512-LABEL: @fptoui_8f64_8i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
-; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
-; AVX512-NEXT:    ret void
+; AVX512F-LABEL: @fptoui_8f64_8i64(
+; AVX512F-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512F-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
+; AVX512F-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512F-NEXT:    ret void
 ;
 ; AVX256DQ-LABEL: @fptoui_8f64_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
@@ -171,11 +171,11 @@ define void @fptoui_8f64_8i32() #0 {
 ; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
 ; AVX256NODQ-NEXT:    ret void
 ;
-; AVX-LABEL: @fptoui_8f64_8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
-; AVX-NEXT:    ret void
+; AVX512-LABEL: @fptoui_8f64_8i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX512-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX512-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
   %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
@@ -352,11 +352,11 @@ define void @fptoui_8f32_8i64() #0 {
 ; AVX256NODQ-NEXT:    store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
 ; AVX256NODQ-NEXT:    ret void
 ;
-; AVX512-LABEL: @fptoui_8f32_8i64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
-; AVX512-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
-; AVX512-NEXT:    ret void
+; AVX512F-LABEL: @fptoui_8f32_8i64(
+; AVX512F-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512F-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
+; AVX512F-NEXT:    store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512F-NEXT:    ret void
 ;
 ; AVX256DQ-LABEL: @fptoui_8f32_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
@@ -396,59 +396,14 @@ define void @fptoui_8f32_8i64() #0 {
 
 define void @fptoui_8f32_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f32_8i32(
-; SSE-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; SSE-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; SSE-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; SSE-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; SSE-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; SSE-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; SSE-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; SSE-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; SSE-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; SSE-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; SSE-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; SSE-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; SSE-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX256NODQ-LABEL: @fptoui_8f32_8i32(
-; AVX256NODQ-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i32
-; AVX256NODQ-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i32
-; AVX256NODQ-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i32
-; AVX256NODQ-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i32
-; AVX256NODQ-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i32
-; AVX256NODQ-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i32
-; AVX256NODQ-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i32
-; AVX256NODQ-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i32
-; AVX256NODQ-NEXT:    store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
-; AVX256NODQ-NEXT:    store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
-; AVX256NODQ-NEXT:    ret void
-;
 ; AVX-LABEL: @fptoui_8f32_8i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
 ; AVX-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>