[llvm] fe6c11c - [CostModel][X86] Improve f64/v2f64/v4f64 FMUL costs on AVX1 targets to account for slower btver2

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri May 21 10:13:06 PDT 2021


Author: Simon Pilgrim
Date: 2021-05-21T18:12:13+01:00
New Revision: fe6c11c5710e95eedbb16c2aac58a5d992e55434

URL: https://github.com/llvm/llvm-project/commit/fe6c11c5710e95eedbb16c2aac58a5d992e55434
DIFF: https://github.com/llvm/llvm-project/commit/fe6c11c5710e95eedbb16c2aac58a5d992e55434.diff

LOG: [CostModel][X86] Improve f64/v2f64/v4f64 FMUL costs on AVX1 targets to account for slower btver2

BTVER2 has a weaker f64 multiplier than other AVX1-era targets, so we need to bump the worst-case cost slightly. llvm-mca reports that the new vectorization in simplebb is beneficial on the btver2, bdver2 and sandybridge AVX1 targets.
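
For context, a rough sketch of the cost-table pattern in X86TTIImpl::getArithmeticInstrCost that this patch extends. The entries shown are the ones added by the diff below; the table names and guard conditions are paraphrased from the surrounding code rather than quoted verbatim, so treat this as an illustration, not the exact source:

  // The AVX2 ("Haswell") table is consulted before the AVX1 table, so
  // giving it explicit FMUL f64/v2f64 entries keeps AVX2 targets at the
  // old cost of 1 while the AVX1 table takes the BTVER2 worst case.
  static const CostTblEntry AVX2CostTable[] = {
    { ISD::FMUL, MVT::f64,   1 },
    { ISD::FMUL, MVT::v2f64, 1 },
    // ...
  };
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  // AVX1 table: FMUL on f64 types now reflects the slower BTVER2 multiplier.
  static const CostTblEntry AVX1CostTable[] = {
    { ISD::FMUL, MVT::f64,   2 },
    { ISD::FMUL, MVT::v2f64, 2 },
    { ISD::FMUL, MVT::v4f64, 4 },
    // ...
  };
  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;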

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/arith-fp.ll
    llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
    llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0cfe8d745763..83ee47691736 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -785,6 +785,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::FADD, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::f64,        1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64,      1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v4f64,      1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v8f32,      1 }, // Haswell from http://www.agner.org/
 
@@ -828,6 +830,10 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::FNEG,    MVT::v4f64,      2 }, // BTVER2 from http://www.agner.org/
     { ISD::FNEG,    MVT::v8f32,      2 }, // BTVER2 from http://www.agner.org/
 
+    { ISD::FMUL,    MVT::f64,        2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL,    MVT::v2f64,      2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL,    MVT::v4f64,      4 }, // BTVER2 from http://www.agner.org/
+
     { ISD::FDIV,    MVT::f32,       14 }, // SNB from http://www.agner.org/
     { ISD::FDIV,    MVT::v4f32,     14 }, // SNB from http://www.agner.org/
     { ISD::FDIV,    MVT::v8f32,     28 }, // SNB from http://www.agner.org/
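
An informal sanity check on the updated AVX1 numbers in the arith-fp.ll hunk below, assuming the usual scaling by the legalized type count (LT.first):

  f64, v2f64 : 2           (new BTVER2 worst case)
  v4f64      : 4
  v8f64      : 2 * 4 = 8   (v8f64 splits into two v4f64 ops under AVX1)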

diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
index 0d2d94c813db..89059c0681da 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -461,10 +461,10 @@ define i32 @fmul(i32 %arg) {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fmul <4 x float> undef, undef
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fmul <8 x float> undef, undef
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fmul <16 x float> undef, undef
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = fmul double undef, undef
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fmul <2 x double> undef, undef
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fmul <4 x double> undef, undef
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fmul <8 x double> undef, undef
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul <2 x double> undef, undef
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul <4 x double> undef, undef
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = fmul <8 x double> undef, undef
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'fmul'

diff --git a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
index e74f7ff02b2b..63424bcb3276 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-fmul.ll
@@ -44,10 +44,10 @@ define void @reduce_f64(double %arg) {
 ;
 ; AVX1-LABEL: 'reduce_f64'
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call double @llvm.vector.reduce.fmul.v1f64(double %arg, <1 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call double @llvm.vector.reduce.fmul.v2f64(double %arg, <2 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call double @llvm.vector.reduce.fmul.v4f64(double %arg, <4 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call double @llvm.vector.reduce.fmul.v8f64(double %arg, <8 x double> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call double @llvm.vector.reduce.fmul.v16f64(double %arg, <16 x double> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX2-LABEL: 'reduce_f64'
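
The SLPVectorizer change below follows from the same numbers: under the AVX1 cost table, the scalar side of simplebb's test_volatile_load now pays for two f64 fmuls at cost 2 each (4 total), while the vector side pays for a single <2 x double> fmul at cost 2, so - ignoring SLP's exact insertelement/store accounting - the vector tree now comes out cheaper and the two multiplies are vectorized even though the volatile loads themselves stay scalar.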

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
index 80a60dd4a3da..2a18d50b3893 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -64,15 +64,17 @@ define void @test_volatile_load(double* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test_volatile_load(
 ; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
 ; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
-; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[I0]], [[I1]]
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
 ; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
-; CHECK-NEXT:    store double [[MUL]], double* [[C:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
-; CHECK-NEXT:    store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[I4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %i0 = load volatile double, double* %a, align 8


        

