[llvm] r357760 - [DAGCombiner][x86] scalarize splatted vector FP ops
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 5 06:32:18 PDT 2019
Author: spatel
Date: Fri Apr 5 06:32:17 2019
New Revision: 357760
URL: http://llvm.org/viewvc/llvm-project?rev=357760&view=rev
Log:
[DAGCombiner][x86] scalarize splatted vector FP ops
There are a variety of vector patterns that may be profitably reduced to a
scalar op when scalar ops are performed using a subset (typically, the
first lane) of the vector register file.
For x86, this is true for float/double ops and element 0 because
insert/extract is just a sub-register rename.
Other targets should likely enable the hook in a similar way.
Differential Revision: https://reviews.llvm.org/D60150
Modified:
llvm/trunk/include/llvm/CodeGen/TargetLowering.h
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h
llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll
llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
Modified: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/TargetLowering.h?rev=357760&r1=357759&r2=357760&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h Fri Apr 5 06:32:17 2019
@@ -2441,6 +2441,14 @@ public:
return false;
}
+ /// Return true if extraction of a scalar element from the given vector type
+ /// at the given index is cheap. For example, if scalar operations occur on
+ /// the same register file as vector operations, then an extract element may
+ /// be a sub-register rename rather than an actual instruction.
+ virtual bool isExtractVecEltCheap(EVT VT, unsigned Index) const {
+ return false;
+ }
+
/// Try to convert math with an overflow comparison into the corresponding DAG
/// node operation. Targets may want to override this independently of whether
/// the operation is legal/custom for the given type because it may obscure
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=357760&r1=357759&r2=357760&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Fri Apr 5 06:32:17 2019
@@ -18078,11 +18078,28 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
- SDNode *V = N0.getNode();
+ int SplatIndex = SVN->getSplatIndex();
+ if (TLI.isExtractVecEltCheap(VT, SplatIndex) &&
+ ISD::isBinaryOp(N0.getNode())) {
+ // splat (vector_bo L, R), Index -->
+ // splat (scalar_bo (extelt L, Index), (extelt R, Index))
+ SDValue L = N0.getOperand(0), R = N0.getOperand(1);
+ SDLoc DL(N);
+ EVT EltVT = VT.getScalarType();
+ SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL);
+ SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
+ SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
+ SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
+ N0.getNode()->getFlags());
+ SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
+ SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
+ return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
+ }
// If this is a bit convert that changes the element type of the vector but
// not the number of vector elements, look through it. Be careful not to
// look through conversions that change things like v4f32 to v2f64.
+ SDNode *V = N0.getNode();
if (V->getOpcode() == ISD::BITCAST) {
SDValue ConvInput = V->getOperand(0);
if (ConvInput.getValueType().isVector() &&
@@ -18115,7 +18132,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE
return N0;
// Canonicalize any other splat as a build_vector.
- const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
+ SDValue Splatted = V->getOperand(SplatIndex);
SmallVector<SDValue, 8> Ops(NumElts, Splatted);
SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=357760&r1=357759&r2=357760&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Fri Apr 5 06:32:17 2019
@@ -1074,6 +1074,12 @@ namespace llvm {
/// supported.
bool shouldScalarizeBinop(SDValue) const override;
+ /// Extract of a scalar FP value from index 0 of a vector is free.
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+ }
+
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
Modified: llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll?rev=357760&r1=357759&r2=357760&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll Fri Apr 5 06:32:17 2019
@@ -274,7 +274,7 @@ define <2 x double> @hadd_v2f64(<2 x dou
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
@@ -286,7 +286,7 @@ define <2 x double> @hadd_v2f64(<2 x dou
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
@@ -298,7 +298,7 @@ define <2 x double> @hadd_v2f64(<2 x dou
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
@@ -398,12 +398,12 @@ define <4 x double> @hadd_v4f64(<4 x dou
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
@@ -447,7 +447,7 @@ define <2 x double> @hsub_v2f64(<2 x dou
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: retq
;
@@ -459,7 +459,7 @@ define <2 x double> @hsub_v2f64(<2 x dou
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT: retq
;
@@ -471,7 +471,7 @@ define <2 x double> @hsub_v2f64(<2 x dou
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT: retq
;
@@ -491,11 +491,11 @@ define <4 x double> @hsub_v4f64(<4 x dou
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1
-; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/scalarize-fp.ll?rev=357760&r1=357759&r2=357760&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/scalarize-fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/scalarize-fp.ll Fri Apr 5 06:32:17 2019
@@ -379,13 +379,13 @@ define <4 x double> @load_fdiv_op0_const
define <2 x double> @fadd_splat_splat_v2f64(<2 x double> %vx, <2 x double> %vy) {
; SSE-LABEL: fadd_splat_splat_v2f64:
; SSE: # %bb.0:
-; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fadd_splat_splat_v2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%splatx = shufflevector <2 x double> %vx, <2 x double> undef, <2 x i32> zeroinitializer
@@ -397,14 +397,14 @@ define <2 x double> @fadd_splat_splat_v2
define <4 x double> @fsub_splat_splat_v4f64(double %x, double %y) {
; SSE-LABEL: fsub_splat_splat_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: subpd %xmm1, %xmm0
+; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: fsub_splat_splat_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -419,13 +419,13 @@ define <4 x double> @fsub_splat_splat_v4
define <4 x float> @fmul_splat_splat_v4f32(<4 x float> %vx, <4 x float> %vy) {
; SSE-LABEL: fmul_splat_splat_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: mulps %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: fmul_splat_splat_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer
@@ -437,26 +437,14 @@ define <4 x float> @fmul_splat_splat_v4f
define <8 x float> @fdiv_splat_splat_v8f32(<8 x float> %vx, <8 x float> %vy) {
; SSE-LABEL: fdiv_splat_splat_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm2
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm2, %xmm1
-; SSE-NEXT: mulps %xmm3, %xmm1
-; SSE-NEXT: addps %xmm3, %xmm1
-; SSE-NEXT: mulps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: divss %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: fdiv_splat_splat_v8f32:
; AVX: # %bb.0:
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -569,7 +557,7 @@ define <4 x double> @fsub_const_op0_spla
; SSE-LABEL: fsub_const_op0_splat_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: subpd %xmm0, %xmm1
+; SSE-NEXT: subsd %xmm0, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -577,7 +565,7 @@ define <4 x double> @fsub_const_op0_spla
; AVX-LABEL: fsub_const_op0_splat_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -667,13 +655,13 @@ define <8 x float> @fdiv_const_op1_splat
define <2 x double> @splat0_fadd_v2f64(<2 x double> %vx, <2 x double> %vy) {
; SSE-LABEL: splat0_fadd_v2f64:
; SSE: # %bb.0:
-; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fadd_v2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%b = fadd <2 x double> %vx, %vy
@@ -684,14 +672,14 @@ define <2 x double> @splat0_fadd_v2f64(<
define <4 x double> @splat0_fsub_v4f64(double %x, double %y) {
; SSE-LABEL: splat0_fsub_v4f64:
; SSE: # %bb.0:
-; SSE-NEXT: subpd %xmm1, %xmm0
+; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fsub_v4f64:
; AVX: # %bb.0:
-; AVX-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -705,13 +693,13 @@ define <4 x double> @splat0_fsub_v4f64(d
define <4 x float> @splat0_fmul_v4f32(<4 x float> %vx, <4 x float> %vy) {
; SSE-LABEL: splat0_fmul_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: mulps %xmm1, %xmm0
+; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fmul_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%b = fmul fast <4 x float> %vx, %vy
@@ -722,26 +710,14 @@ define <4 x float> @splat0_fmul_v4f32(<4
define <8 x float> @splat0_fdiv_v8f32(<8 x float> %vx, <8 x float> %vy) {
; SSE-LABEL: splat0_fdiv_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm2
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm2, %xmm1
-; SSE-NEXT: mulps %xmm3, %xmm1
-; SSE-NEXT: addps %xmm3, %xmm1
-; SSE-NEXT: mulps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: divss %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fdiv_v8f32:
; AVX: # %bb.0:
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -753,16 +729,13 @@ define <8 x float> @splat0_fdiv_v8f32(<8
define <2 x double> @splat0_fadd_const_op1_v2f64(<2 x double> %vx) {
; SSE-LABEL: splat0_fadd_const_op1_v2f64:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: addpd %xmm0, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addsd {{.*}}(%rip), %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fadd_const_op1_v2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%b = fadd <2 x double> %vx, <double 42.0, double 12.0>
@@ -774,7 +747,7 @@ define <4 x double> @splat0_fsub_const_o
; SSE-LABEL: splat0_fsub_const_op0_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: subpd %xmm0, %xmm1
+; SSE-NEXT: subsd %xmm0, %xmm1
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -782,7 +755,7 @@ define <4 x double> @splat0_fsub_const_o
; AVX-LABEL: splat0_fsub_const_op0_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -795,16 +768,13 @@ define <4 x double> @splat0_fsub_const_o
define <4 x float> @splat0_fmul_const_op1_v4f32(<4 x float> %vx) {
; SSE-LABEL: splat0_fmul_const_op1_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: mulps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: mulss {{.*}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fmul_const_op1_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
%b = fmul fast <4 x float> %vx, <float 6.0, float -1.0, float 1.0, float 7.0>
@@ -821,13 +791,6 @@ define <8 x float> @splat0_fdiv_const_op
;
; AVX-LABEL: splat0_fdiv_const_op1_v8f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vrcpps %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -839,24 +802,16 @@ define <8 x float> @splat0_fdiv_const_op
define <8 x float> @splat0_fdiv_const_op0_v8f32(<8 x float> %vx) {
; SSE-LABEL: splat0_fdiv_const_op0_v8f32:
; SSE: # %bb.0:
-; SSE-NEXT: rcpps %xmm0, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm0
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm0, %xmm1
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: addps %xmm2, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splat0_fdiv_const_op0_v8f32:
; AVX: # %bb.0:
-; AVX-NEXT: vrcpps %ymm0, %ymm1
-; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: retq
More information about the llvm-commits
mailing list