[llvm] d21bf51 - [CostModel][X86] Adjust pre-SSE41 fp scalar select costs to account for vector ops
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri May 6 03:42:08 PDT 2022
Author: Simon Pilgrim
Date: 2022-05-06T11:41:55+01:00
New Revision: d21bf514940fd3b6368796e3ad22e1910c8598c6
URL: https://github.com/llvm/llvm-project/commit/d21bf514940fd3b6368796e3ad22e1910c8598c6
DIFF: https://github.com/llvm/llvm-project/commit/d21bf514940fd3b6368796e3ad22e1910c8598c6.diff
LOG: [CostModel][X86] Adjust pre-SSE41 fp scalar select costs to account for vector ops
Based off the script from D103695, we now mainly use BLENDV or OR(AND,ANDN) to select scalar float/double ops
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/X86/vselect-cost.ll
llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 6bb79abc8a8e..ca54e1b86b5b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2753,7 +2753,9 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
static const CostTblEntry SSE41CostTbl[] = {
{ ISD::SELECT, MVT::v2f64, 1 }, // blendvpd
+ { ISD::SELECT, MVT::f64, 1 }, // blendvpd
{ ISD::SELECT, MVT::v4f32, 1 }, // blendvps
+ { ISD::SELECT, MVT::f32 , 1 }, // blendvps
{ ISD::SELECT, MVT::v2i64, 1 }, // pblendvb
{ ISD::SELECT, MVT::v4i32, 1 }, // pblendvb
{ ISD::SELECT, MVT::v8i16, 1 }, // pblendvb
@@ -2769,6 +2771,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i8, 1 },
{ ISD::SELECT, MVT::v2f64, 2 }, // andpd + andnpd + orpd
+ { ISD::SELECT, MVT::f64, 2 }, // andpd + andnpd + orpd
{ ISD::SELECT, MVT::v2i64, 2 }, // pand + pandn + por
{ ISD::SELECT, MVT::v4i32, 2 }, // pand + pandn + por
{ ISD::SELECT, MVT::v8i16, 2 }, // pand + pandn + por
@@ -2780,6 +2783,7 @@ InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::f32, 1 },
{ ISD::SELECT, MVT::v4f32, 2 }, // andps + andnps + orps
+ { ISD::SELECT, MVT::f32, 2 }, // andps + andnps + orps
};
if (ST->useSLMArithCosts())
diff --git a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
index 88e81ac21cc3..9c475f9d6fc5 100644
--- a/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vselect-cost.ll
@@ -148,11 +148,11 @@ define i32 @test_select() {
define i32 @test_select_fp() {
; SSE2-LABEL: 'test_select_fp'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = select i1 undef, double undef, double undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = select i1 undef, double undef, double undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = select <8 x i1> undef, <8 x double> undef, <8 x double> undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = select i1 undef, float undef, float undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = select i1 undef, float undef, float undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = select <8 x i1> undef, <8 x float> undef, <8 x float> undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = select <16 x i1> undef, <16 x float> undef, <16 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
index 2e84c104adbb..ef1ed33ffcb9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -1,87 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s -check-prefix=SSE
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s -check-prefix=AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mattr=+avx | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"
define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
-; SSE-LABEL: @testfunc(
-; SSE-NEXT: entry:
-; SSE-NEXT: br label [[FOR_BODY:%.*]]
-; SSE: for.body:
-; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
-; SSE-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP5]], i32 1
-; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP1]], i32 1
-; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP6]], [[TMP8]]
-; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP2]], [[TMP9]]
-; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
-; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
-; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
-; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]]
-; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0
-; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1
-; SSE-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; SSE-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], <float -1.000000e+00, float -1.000000e+00>
-; SSE-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP21]]
-; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
-; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; SSE: for.end:
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @testfunc(
-; AVX-NEXT: entry:
-; AVX-NEXT: br label [[FOR_BODY:%.*]]
-; AVX: for.body:
-; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
-; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
-; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
-; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
-; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
-; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
-; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
-; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
-; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
-; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
-; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
-; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; AVX: for.end:
-; AVX-NEXT: ret void
+; CHECK-LABEL: @testfunc(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
;
entry:
br label %for.body
More information about the llvm-commits
mailing list