[llvm] r375341 - [X86][SSE] LowerUINT_TO_FP_i64 - only use HADDPD for size/fast-hops
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 19 04:53:48 PDT 2019
Author: rksimon
Date: Sat Oct 19 04:53:48 2019
New Revision: 375341
URL: http://llvm.org/viewvc/llvm-project?rev=375341&view=rev
Log:
[X86][SSE] LowerUINT_TO_FP_i64 - only use HADDPD for size/fast-hops
We were always generating a single source HADDPD, but really we should only do this if shouldUseHorizontalOp says it's a good idea.
Differential Revision: https://reviews.llvm.org/D69175
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/haddsub-3.ll
llvm/trunk/test/CodeGen/X86/haddsub-broadcast.ll
llvm/trunk/test/CodeGen/X86/scalar-int-to-fp.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=375341&r1=375340&r2=375341&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat Oct 19 04:53:48 2019
@@ -18510,6 +18510,16 @@ SDValue X86TargetLowering::BuildFILD(SDV
return Result;
}
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -18564,8 +18574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDVal
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3()) {
- // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
+ if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -19623,16 +19632,6 @@ static SDValue LowerSTRICT_FP_ROUND(SDVa
return Op;
}
-/// Horizontal vector math instructions may be slower than normal math with
-/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
-/// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
- bool HasFastHOps = Subtarget.hasFastHorizontalOps();
- return !IsSingleSource || IsOptimizingSize || HasFastHOps;
-}
-
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll?rev=375341&r1=375340&r2=375341&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll Sat Oct 19 04:53:48 2019
@@ -1841,7 +1841,8 @@ define <2 x double> @test_mm_cvtu64_sd(<
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
-; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/haddsub-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-3.ll?rev=375341&r1=375340&r2=375341&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-3.ll Sat Oct 19 04:53:48 2019
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
define float @pr26491(<4 x float> %a0) {
; SSE2-LABEL: pr26491:
@@ -58,37 +60,68 @@ define <4 x double> @PR41414(i64 %x, <4
; SSE2-NEXT: addpd %xmm2, %xmm1
; SSE2-NEXT: retq
;
-; SSSE3-LABEL: PR41414:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq %rdi, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
-; SSSE3-NEXT: subpd {{.*}}(%rip), %xmm2
-; SSSE3-NEXT: haddpd %xmm2, %xmm2
-; SSSE3-NEXT: divpd %xmm2, %xmm1
-; SSSE3-NEXT: divpd %xmm2, %xmm0
-; SSSE3-NEXT: xorpd %xmm2, %xmm2
-; SSSE3-NEXT: addpd %xmm2, %xmm0
-; SSSE3-NEXT: addpd %xmm2, %xmm1
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: PR41414:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movq %rdi, %xmm2
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2
+; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3
+; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0]
+; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: PR41414:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: movq %rdi, %xmm2
+; SSSE3-FAST-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSSE3-FAST-NEXT: subpd {{.*}}(%rip), %xmm2
+; SSSE3-FAST-NEXT: haddpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT: divpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT: divpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0
+; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: PR41414:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1
+; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: retq
;
-; AVX1-LABEL: PR41414:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %rdi, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
+; AVX1-FAST-LABEL: PR41414:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vmovq %rdi, %xmm1
+; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: PR41414:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
Modified: llvm/trunk/test/CodeGen/X86/haddsub-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-broadcast.ll?rev=375341&r1=375340&r2=375341&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-broadcast.ll Sat Oct 19 04:53:48 2019
@@ -9,7 +9,8 @@ define <4 x double> @PR43402(i64 %x) {
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; CHECK-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: retl
%conv = uitofp i64 %x to double
Modified: llvm/trunk/test/CodeGen/X86/scalar-int-to-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scalar-int-to-fp.ll?rev=375341&r1=375340&r2=375341&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scalar-int-to-fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scalar-int-to-fp.ll Sat Oct 19 04:53:48 2019
@@ -610,8 +610,9 @@ define double @u64_to_d(i64 %a) nounwind
; AVX512F_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F_32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512F_32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX512F_32-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX512F_32-NEXT: vmovlpd %xmm0, (%esp)
+; AVX512F_32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512F_32-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX512F_32-NEXT: vmovsd %xmm0, (%esp)
; AVX512F_32-NEXT: fldl (%esp)
; AVX512F_32-NEXT: movl %ebp, %esp
; AVX512F_32-NEXT: popl %ebp
More information about the llvm-commits
mailing list