[llvm] 7f4ff78 - [x86] use vector instructions to lower even more FP->int->FP casts
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 25 08:40:00 PDT 2020
Author: Sanjay Patel
Date: 2020-04-25T11:38:54-04:00
New Revision: 7f4ff782d406390208b7845c15383941eb4e2dc8
URL: https://github.com/llvm/llvm-project/commit/7f4ff782d406390208b7845c15383941eb4e2dc8
DIFF: https://github.com/llvm/llvm-project/commit/7f4ff782d406390208b7845c15383941eb4e2dc8.diff
LOG: [x86] use vector instructions to lower even more FP->int->FP casts
This is another enhancement to D77895/D78362
to avoid a round-trip from XMM->GPR->XMM.
This time we handle the case of starting/ending with different FP types
but always with signed i32 as the intermediate value.
I think this covers all of the faux vector optimization possibilities
for pre-AVX512.
There is at least one other transform mentioned in PR36617:
https://bugs.llvm.org/show_bug.cgi?id=36617#c19
...where we fold an 'fpext' into a preceding 'sitofp'. I think we will
want to handle that earlier (DAGCombiner or instcombine) because that's
a target-independent optimization.
Differential Revision: https://reviews.llvm.org/D78758
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/ftrunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1576ac639dd2..44552eb706a7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19181,27 +19181,28 @@ static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
MVT IntVT = CastToInt.getSimpleValueType();
SDValue X = CastToInt.getOperand(0);
- // TODO: Allow size-changing from source to dest (double -> i32 -> float)
- if (X.getSimpleValueType() != VT)
+ MVT SrcVT = X.getSimpleValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return SDValue();
// See if we have 128-bit vector cast instructions for this type of cast.
- // We need cvttps2dq + cvtdq2ps or cvttpd2dq + cvtdq2pd.
+ // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
IntVT != MVT::i32)
return SDValue();
- unsigned NumFPEltsInXMM = 128 / VT.getScalarSizeInBits();
- unsigned NumIntEltsInXMM = 128 / IntVT.getScalarSizeInBits();
- MVT VecFPVT = MVT::getVectorVT(VT, NumFPEltsInXMM);
- MVT VecIntVT = MVT::getVectorVT(IntVT, NumIntEltsInXMM);
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned IntSize = IntVT.getSizeInBits();
+ unsigned VTSize = VT.getSizeInBits();
+ MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
+ MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
// We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
- bool NeedX86Opcodes = VT.getSizeInBits() != IntVT.getSizeInBits();
unsigned ToIntOpcode =
- NeedX86Opcodes ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
+ SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
unsigned ToFPOpcode =
- NeedX86Opcodes ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
+ IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
// sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
//
@@ -19211,9 +19212,9 @@ static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
// penalties) with cast ops.
SDLoc DL(CastToFP);
SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
- SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecFPVT, X);
+ SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
- SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecFPVT, VCastToInt);
+ SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index d3179f0a3827..fa7fd2b22acb 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -301,15 +301,14 @@ define double @trunc_signed32_f64_nsz(double %x) #0 {
define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math:
; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sd %eax, %xmm0
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:
; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttss2si %xmm0, %eax
-; AVX1-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-NEXT: retq
%i = fptosi float %x to i32
%r = sitofp i32 %i to double
@@ -319,15 +318,14 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
define double @trunc_f32_signed32_f64_nsz(float %x) #0 {
; SSE-LABEL: trunc_f32_signed32_f64_nsz:
; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sd %eax, %xmm0
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_f32_signed32_f64_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttss2si %xmm0, %eax
-; AVX1-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX1-NEXT: retq
%i = fptosi float %x to i32
%r = sitofp i32 %i to double
@@ -337,15 +335,14 @@ define double @trunc_f32_signed32_f64_nsz(float %x) #0 {
define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ss %eax, %xmm0
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:
; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttsd2si %xmm0, %eax
-; AVX1-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0
+; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX1-NEXT: retq
%i = fptosi double %x to i32
%r = sitofp i32 %i to float
@@ -355,15 +352,14 @@ define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
define float @trunc_f64_signed32_f32_nsz(double %x) #0 {
; SSE-LABEL: trunc_f64_signed32_f32_nsz:
; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ss %eax, %xmm0
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_f64_signed32_f32_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttsd2si %xmm0, %eax
-; AVX1-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0
+; AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX1-NEXT: retq
%i = fptosi double %x to i32
%r = sitofp i32 %i to float
More information about the llvm-commits
mailing list