[llvm] a135c4a - [X86] Don't scalarize v2f32->v2i64 strict_fp_to_sint/uint with avx512dq and not avx512vl.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 7 14:46:25 PDT 2020
Author: Craig Topper
Date: 2020-06-07T14:45:26-07:00
New Revision: a135c4a2cf7197dd3140f610f51b596d7334bae2
URL: https://github.com/llvm/llvm-project/commit/a135c4a2cf7197dd3140f610f51b596d7334bae2
DIFF: https://github.com/llvm/llvm-project/commit/a135c4a2cf7197dd3140f610f51b596d7334bae2.diff
LOG: [X86] Don't scalarize v2f32->v2i64 strict_fp_to_sint/uint with avx512dq and not avx512vl.
We can pad the v2f32 with 0s up to v8f32 and use a v8f32->v8i64
operation. This is what we end up with on non-strict nodes except
we don't pad with 0s since we don't care about exceptions.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6638a8f32003..7db874e0e8e0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1719,6 +1719,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1838,19 +1851,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
- if (Subtarget.hasDQI()) {
- // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
- // v2f32 UINT_TO_FP is already custom under SSE2.
- assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
- isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
- "Unexpected operation action!");
- // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
- }
-
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
@@ -20717,6 +20717,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (!Subtarget.hasVLX()) {
+ // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
+ // legalizer and then widened again by vector op legalization.
+ if (!IsStrict)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
+ {Src, Zero, Zero, Zero});
+ Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Tmp});
+ SDValue Chain = Tmp.getValue(1);
+ Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp, Chain}, dl);
+ return Tmp;
+ }
+
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index af9663d7798f..0c706b36e432 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -685,26 +685,14 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
-; AVX512DQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
-; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero
-; AVX512DQ-32-NEXT: vcvttps2qq %ymm1, %zmm1
-; AVX512DQ-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512DQ-32-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-32-NEXT: vzeroupper
-; AVX512DQ-32-NEXT: retl
-;
-; AVX512DQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
-; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-64-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-64-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-64-NEXT: retq
+; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: ret{{[l|q]}}
;
; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i64:
; AVX512VLDQ: # %bb.0:
@@ -1016,26 +1004,14 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
-; AVX512DQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
-; AVX512DQ-32: # %bb.0:
-; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero
-; AVX512DQ-32-NEXT: vcvttps2uqq %ymm1, %zmm1
-; AVX512DQ-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512DQ-32-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-32-NEXT: vzeroupper
-; AVX512DQ-32-NEXT: retl
-;
-; AVX512DQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
-; AVX512DQ-64: # %bb.0:
-; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-64-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512DQ-64-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-64-NEXT: retq
+; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: ret{{[l|q]}}
;
; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; AVX512VLDQ: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 8ea92294fc03..22e97e740753 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -3976,14 +3976,30 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 {
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
-; AVX-LABEL: constrained_vector_fptosi_v2i64_v2f32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f32:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: retq
+;
+; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f32:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f32:
+; AVX512DQ: # %bb.0: # %entry
+; AVX512DQ-NEXT: vcvttps2qq {{.*}}(%rip), %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,
@@ -4588,14 +4604,21 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
-; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f32:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f32:
+; AVX512DQ: # %bb.0: # %entry
+; AVX512DQ-NEXT: vcvttps2uqq {{.*}}(%rip), %zmm0
+; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,
More information about the llvm-commits
mailing list