[llvm] 3ae7f7a - [CostModel][X86] Tweak fptoui v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs

Fri May 21 04:10:01 PDT 2021

Author: Simon Pilgrim
Date: 2021-05-21T12:09:31+01:00
New Revision: 3ae7f7ae0a33961be48948205981aea91920d3aa

URL: https://github.com/llvm/llvm-project/commit/3ae7f7ae0a33961be48948205981aea91920d3aa
DIFF: https://github.com/llvm/llvm-project/commit/3ae7f7ae0a33961be48948205981aea91920d3aa.diff

LOG: [CostModel][X86] Tweak fptoui v4f32->v4i32 + v8f32->v8i32 SSE/AVX costs

Adjust for worst case for atom/slm (SSE), btver2/sandybridge (AVX1) and haswell/znver* (AVX2)

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/cast.ll
    llvm/test/Analysis/CostModel/X86/uitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 18e85f030089..27841b702dd2 100644

--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1792,7 +1792,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
     { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
 
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  5 },
   };
 
   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
@@ -1857,7 +1857,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
-    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 8 },
     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
     // The generic code to compute the scalar overhead is currently broken.
@@ -1961,7 +1961,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
 

diff  --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 7764e47eb720..815b73ee7208 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -447,13 +447,13 @@ define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
 
 define void @uitofp4(<4 x i1> %a, <4 x i8> %b, <4 x i16> %c, <4 x i32> %d) {
 ; SSE-LABEL: 'uitofp4'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %A1 = uitofp <4 x i1> %a to <4 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %A2 = uitofp <4 x i1> %a to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <4 x i8> %b to <4 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %B2 = uitofp <4 x i8> %b to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <4 x i16> %c to <4 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %C2 = uitofp <4 x i16> %c to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = uitofp <4 x i32> %d to <4 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %D2 = uitofp <4 x i32> %d to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -495,21 +495,21 @@ define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX1-LABEL: 'uitofp8'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'uitofp8'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'uitofp8'

diff  --git a/llvm/test/Analysis/CostModel/X86/uitofp.ll b/llvm/test/Analysis/CostModel/X86/uitofp.ll
index 36bf919fecd7..42d1e59753df 100644
--- a/llvm/test/Analysis/CostModel/X86/uitofp.ll
+++ b/llvm/test/Analysis/CostModel/X86/uitofp.ll
@@ -209,26 +209,26 @@ define i32 @uitofp_i16_float() {
 define i32 @uitofp_i32_float() {
 ; SSE-LABEL: 'uitofp_i32_float'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'uitofp_i32_float'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'uitofp_i32_float'
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f32 = uitofp i32 undef to float
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'uitofp_i32_float'

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 99f16a07e562..b49384f84842 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -7,31 +7,11 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R72]]
-;
-; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX-LABEL: @sitofp_uitofp(
-; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @sitofp_uitofp(
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @sitofp_uitofp(
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 759744055bca..013d2f8a2179 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -7,31 +7,11 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
-; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R72]]
-;
-; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX-LABEL: @sitofp_uitofp(
-; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @sitofp_uitofp(
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @sitofp_uitofp(
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1