[llvm] r353302 - [x86] vectorize cast ops in lowering to avoid register file transfers
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 6 06:59:39 PST 2019
Author: spatel
Date: Wed Feb 6 06:59:39 2019
New Revision: 353302
URL: http://llvm.org/viewvc/llvm-project?rev=353302&view=rev
Log:
[x86] vectorize cast ops in lowering to avoid register file transfers
The proposal in D56796 may cross the line because we're trying to avoid vectorization
transforms in generic DAG combining. So this is an alternate, later, x86-specific
translation of that patch.
There are several potential follow-ups to enhance this:
1. Allow extraction from non-zero element index.
2. Peek through extends of smaller width integers.
3. Support x86-specific conversion opcodes like X86ISD::CVTSI2P.
Differential Revision: https://reviews.llvm.org/D56864
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/known-bits-vector.ll
llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll
llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=353302&r1=353301&r2=353302&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Feb 6 06:59:39 2019
@@ -17540,6 +17540,57 @@ static SDValue LowerI64IntToFP_AVX512DQ(
DAG.getIntPtrConstant(0, dl));
}
+static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
+ const X86Subtarget &Subtarget) {
+ switch (Opcode) {
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
+
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
+ return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
+ }
+}
+
+/// Given a scalar cast operation that is extracted from a vector, try to
+/// vectorize the cast op followed by extraction. This will avoid an expensive
+/// round-trip between XMM and GPR.
+static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: The limitation for extracting from the 0-element is not required,
+ // but if we extract from some other element, it will require shuffling to
+ // get the result into the right place.
+ // TODO: This could be enhanced to handle smaller integer types by peeking
+ // through an extend.
+ SDValue Extract = Cast.getOperand(0);
+ MVT DestVT = Cast.getSimpleValueType();
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(Extract.getOperand(1)))
+ return SDValue();
+
+ SDValue VecOp = Extract.getOperand(0);
+ MVT FromVT = VecOp.getSimpleValueType();
+ MVT ToVT = MVT::getVectorVT(DestVT, FromVT.getVectorNumElements());
+ if (!useVectorCast(Cast.getOpcode(), FromVT, ToVT, Subtarget))
+ return SDValue();
+
+ // cast (extract V, Y) --> extract (cast V), Y
+ SDLoc DL(Cast);
+ SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
+ Extract.getOperand(1));
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
@@ -17547,6 +17598,9 @@ SDValue X86TargetLowering::LowerSINT_TO_
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
@@ -17909,6 +17963,9 @@ SDValue X86TargetLowering::LowerUINT_TO_
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
+ if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
+ return Extract;
+
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
Modified: llvm/trunk/test/CodeGen/X86/known-bits-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/known-bits-vector.ll?rev=353302&r1=353301&r2=353302&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/known-bits-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/known-bits-vector.ll Wed Feb 6 06:59:39 2019
@@ -25,8 +25,7 @@ define float @knownbits_mask_extract_uit
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
Modified: llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll?rev=353302&r1=353301&r2=353302&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll Wed Feb 6 06:59:39 2019
@@ -92,8 +92,7 @@ define float @signbits_ashr_extract_sito
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@@ -120,8 +119,7 @@ define float @signbits_ashr_shl_extract_
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsllq $20, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@@ -152,8 +150,7 @@ define float @signbits_ashr_insert_ashr_
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT: vpsrlq $3, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@@ -239,8 +236,7 @@ define float @signbits_ashr_sext_sextinr
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@@ -283,8 +279,7 @@ define float @signbits_ashr_sextvecinreg
; X32-NEXT: vpand %xmm1, %xmm0, %xmm2
; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=353302&r1=353301&r2=353302&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Wed Feb 6 06:59:39 2019
@@ -5556,15 +5556,12 @@ define <4 x float> @sitofp_i64_to_4f32(<
define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract0_sitofp_v4i32_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to float
@@ -5575,8 +5572,7 @@ define float @extract0_sitofp_v4i32_f32i
; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: incl %eax
; SSE-NEXT: cvtsi2ssl %eax, %xmm1
; SSE-NEXT: divss %xmm1, %xmm0
@@ -5585,7 +5581,7 @@ define float @extract0_sitofp_v4i32_f32i
; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: incl %eax
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
@@ -5601,17 +5597,15 @@ define float @extract0_sitofp_v4i32_f32i
define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: cvtsi2ssl %eax, %xmm1
-; SSE-NEXT: movd %xmm0, (%rdi)
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE-NEXT: movss %xmm0, (%rdi)
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1
-; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1
+; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
@@ -5630,8 +5624,7 @@ define double @extract0_sitofp_v4i32_f64
;
; AVX-LABEL: extract0_sitofp_v4i32_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtsi2sdl %eax, %xmm1, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to double
@@ -5652,11 +5645,31 @@ define float @extract0_uitofp_v4i32_f32(
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0
; VEX-NEXT: retq
;
-; AVX512-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vcvtusi2ssl %eax, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to float
ret float %r
@@ -5676,11 +5689,35 @@ define double @extract0_uitofp_v4i32_f64
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0
; VEX-NEXT: retq
;
-; AVX512-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vcvtusi2sdl %eax, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
+; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VLDQ-NEXT: vzeroupper
+; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to double
ret double %r
@@ -5692,9 +5729,7 @@ define float @extract3_sitofp_v4i32_f32(
; SSE2-LABEL: extract3_sitofp_v4i32_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract3_sitofp_v4i32_f32:
More information about the llvm-commits mailing list