[PATCH][X86] Fix for a poor code generation bug affecting addss/mulss and other SSE scalar fp arithmetic instructions
Andrea Di Biagio
andrea.dibiagio at gmail.com
Fri Dec 6 13:40:15 PST 2013
Hi,
This patch fixes a poor code generation bug affecting SSE scalar fp
instructions like addss/mulss.
The problem was originally reported here:
http://comments.gmane.org/gmane.comp.compilers.llvm.devel/68542
At the moment, the x86 backend tends to emit unnecessary vector insert
instructions immediately after SSE scalar fp instructions.
Example:
/////////////////////////////////
__m128 foo(__m128 A, __m128 B) {
  A[0] += B[0];
  return A;
}
/////////////////////////////////
produces the following sequence:
addss %xmm0, %xmm1
movss %xmm1, %xmm0
Instead of:
addss %xmm1, %xmm0
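The same issue shows up with the double-precision variants. Here is a
minimal sketch (an illustration only, not from the original report; it
assumes the analogous addsd/movsd sequence is emitted today):
/////////////////////////////////
__m128d bar(__m128d A, __m128d B) {
  A[0] += B[0];
  return A;
}
/////////////////////////////////
which should compile to a single addsd %xmm1, %xmm0 once the combine is
in place (see test_add_sd in the attached test).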
This patch addresses the problem at the ISel stage by introducing a
target-specific combine rule to fold patterns like this one:
  a0     : f32   = extract_vector_elt (A, 0)
  b0     : f32   = extract_vector_elt (B, 0)
  r0     : f32   = fadd a0, b0
  result : v4f32 = insert_vector_elt (A, r0, 0)
into a single 'addss' instruction.
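For reference, this is the LLVM IR form of that pattern (taken from the
first case, test_add_ss, in the attached test):
/////////////////////////////////
define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}
/////////////////////////////////
With the combine, both the SSE and AVX paths select a single addss
(vaddss with AVX), as checked by the new test.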
Please let me know what you think.
Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: test/CodeGen/X86/sse-scalar-fp-arith.ll
===================================================================
--- test/CodeGen/X86/sse-scalar-fp-arith.ll (revision 0)
+++ test/CodeGen/X86/sse-scalar-fp-arith.ll (revision 0)
@@ -0,0 +1,225 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %add = fadd float %2, %1
+ %3 = insertelement <4 x float> %a, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %a, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %mul = fmul float %2, %1
+ %3 = insertelement <4 x float> %a, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %a, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %add = fadd double %2, %1
+ %3 = insertelement <2 x double> %a, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %a, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %mul = fmul double %2, %1
+ %3 = insertelement <2 x double> %a, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %a, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %add = fadd float %1, %2
+ %3 = insertelement <4 x float> %b, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %b, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %mul = fmul float %1, %2
+ %3 = insertelement <4 x float> %b, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %b, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %add = fadd double %1, %2
+ %3 = insertelement <2 x double> %b, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %b, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %mul = fmul double %1, %2
+ %3 = insertelement <2 x double> %b, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %b, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td (revision 196599)
+++ lib/Target/X86/X86InstrFragmentsSIMD.td (working copy)
@@ -35,6 +35,11 @@
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fadds : SDNode<"X86ISD::FADDS", SDTFPBinOp>;
+def X86fsubs : SDNode<"X86ISD::FSUBS", SDTFPBinOp>;
+def X86fmuls : SDNode<"X86ISD::FMULS", SDTFPBinOp>;
+def X86fdivs : SDNode<"X86ISD::FDIVS", SDTFPBinOp>;
+
// Commutative and Associative FMIN and FMAX.
def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 196599)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -1527,6 +1527,7 @@
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
computeRegisterProperties();
@@ -13906,6 +13907,10 @@
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
+ case X86ISD::FADDS: return "X86ISD::FADDS";
+ case X86ISD::FSUBS: return "X86ISD::FSUBS";
+ case X86ISD::FMULS: return "X86ISD::FMULS";
+ case X86ISD::FDIVS: return "X86ISD::FDIVS";
}
}
@@ -19165,6 +19170,74 @@
In.getOperand(0));
}
+// Helper function used by 'PerformINSERT_VECTOR_ELTCombine'
+// to match an EXTRACT_VECTOR_ELT(V, 0).
+static bool isExtractElementZero(const SDValue &V) {
+ return ((V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+ X86::isZeroNode(V.getOperand(1)));
+}
+
+/// PerformINSERT_VECTOR_ELTCombine - Performs INSERT_VECTOR_ELT combines.
+/// Try to synthesize SSE1/SSE2 single or double precision fp arithmetic from
+/// complex patterns involving scalar floating point arithmetic instructions
+/// and insert/extract vector element.
+static SDValue PerformINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!(((VT == MVT::v4f32) && Subtarget->hasSSE1()) ||
+ ((VT == MVT::v2f64) && Subtarget->hasSSE2())))
+ return SDValue();
+
+ if (!X86::isZeroNode(N->getOperand(2)))
+ return SDValue();
+
+ SDValue Src = N->getOperand(1);
+ int Opcode = Src->getOpcode();
+ if ((Opcode != ISD::FADD) && (Opcode != ISD::FSUB) &&
+ (Opcode != ISD::FMUL) && (Opcode != ISD::FDIV))
+ return SDValue();
+
+ // Try to fold
+ // (insert_vector_elt VT:$dest,
+ // (ISD::Opcode (extract_vector_elt VT:$dest, 0),
+ // (extract_vector_elt VT:$src2, 0))) ->
+ // (X86ISD::#Opcode#S (VT:$dest, VT:$src2))
+ SDValue Op0 = Src->getOperand(0);
+ SDValue Op1 = Src->getOperand(1);
+ SDValue Dest = N->getOperand(0);
+
+ if (!(isExtractElementZero(Op0) && isExtractElementZero(Op1)))
+ return SDValue();
+
+ if (Op0->getOperand(0) != Dest) {
+ if ((Opcode != ISD::FADD) && (Opcode != ISD::FMUL))
+ // Cannot commute the operands.
+ return SDValue();
+
+ if (Op1->getOperand(0) != Dest)
+ return SDValue();
+
+ // We can still match the pattern by
+ // commuting the operands of the FADD/FMUL.
+ Op0 = Src->getOperand(1);
+ Op1 = Src->getOperand(0);
+ }
+
+ Op1 = Op1->getOperand(0);
+ if (Op1->getValueType(0) != VT)
+ return SDValue();
+
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::FADD: Opcode = X86ISD::FADDS; break;
+ case ISD::FMUL: Opcode = X86ISD::FMULS; break;
+ case ISD::FSUB: Opcode = X86ISD::FSUBS; break;
+ case ISD::FDIV: Opcode = X86ISD::FDIVS; break;
+ }
+
+ return DAG.getNode(Opcode, SDLoc(N), VT, Dest, Op1);
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -19222,6 +19295,8 @@
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
+ case ISD::INSERT_VECTOR_ELT:
+ return PerformINSERT_VECTOR_ELTCombine(N, DAG, Subtarget);
}
return SDValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td (revision 196599)
+++ lib/Target/X86/X86InstrSSE.td (working copy)
@@ -3017,6 +3017,45 @@
basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
+def : Pat<(v4f32 (X86fadds (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fsubs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fmuls (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fdivs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2f64 (X86fadds (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fsubs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fmuls (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fdivs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+let AddedComplexity = 20, Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86fadds (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86fsubs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86fmuls (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86fdivs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+ (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v2f64 (X86fadds (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fsubs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fmuls (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86fdivs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h (revision 196599)
+++ lib/Target/X86/X86ISelLowering.h (working copy)
@@ -292,6 +292,11 @@
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
+ // Scalar single/double precision floating point arithmetic operations.
+ // These operators work on vector types and are not commutative. They
+ // perform both the named operation and an implicit BLEND.
+ FADDS, FSUBS, FMULS, FDIVS,
+
BLSI, // BLSI - Extract lowest set isolated bit
BLSMSK, // BLSMSK - Get mask up to lowest set bit
BLSR, // BLSR - Reset lowest set bit