[PATCH][X86] Fix for a poor code generation bug affecting addss/mulss and other SSE scalar fp arithmetic instructions

Andrea Di Biagio andrea.dibiagio at gmail.com
Fri Dec 6 13:40:15 PST 2013


Hi,

This patch fixes a poor code generation bug affecting SSE scalar fp
instructions like addss/mulss.
The problem was originally reported here:
http://comments.gmane.org/gmane.comp.compilers.llvm.devel/68542

At the moment, the x86 backend tends to emit unnecessary vector insert
instructions immediately after SSE scalar fp instructions.

Example:
/////////////////////////////////
__m128 foo(__m128 A, __m128 B) {
  A[0] += B[0];
  return A;
}
/////////////////////////////////

produces the following sequence:
  addss  %xmm0, %xmm1
  movss  %xmm1, %xmm0

Instead of:
  addss %xmm1, %xmm0

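For reference, after optimization the IR for foo() boils down to the same
extract/fadd/insert pattern exercised by the test_add_ss case in the
attached test (function and value names below are just illustrative):

  ; roughly what clang + opt produce for foo() above
  define <4 x float> @foo(<4 x float> %A, <4 x float> %B) {
    %1 = extractelement <4 x float> %B, i32 0
    %2 = extractelement <4 x float> %A, i32 0
    %add = fadd float %2, %1
    %3 = insertelement <4 x float> %A, float %add, i32 0
    ret <4 x float> %3
  }
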
This patch addresses the problem at the ISel stage by introducing a
target-specific combine rule to fold patterns like this one:

  a0     : f32   = extract_vector_elt (A, 0)
  b0     : f32   = extract_vector_elt (B, 0)
  r0     : f32   = fadd a0, b0
  result : v4f32 = insert_vector_elt (A, r0, 0)

into a single 'addss' instruction.
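
In DAG terms, the combine rewrites the pattern above into a single
target-specific node (same notation as above):

  result : v4f32 = X86ISD::FADDS A, B

which is then selected by the new (V)ADDSSrr_Int patterns added to
X86InstrSSE.td; FSUB/FMUL/FDIV are handled in the same way through
X86ISD::FSUBS, FMULS and FDIVS.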

Please let me know what you think.

Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: test/CodeGen/X86/sse-scalar-fp-arith.ll
===================================================================
--- test/CodeGen/X86/sse-scalar-fp-arith.ll	(revision 0)
+++ test/CodeGen/X86/sse-scalar-fp-arith.ll	(revision 0)
@@ -0,0 +1,225 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %add = fadd float %2, %1
+  %3 = insertelement <4 x float> %a, float %add, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss   %xmm1, %xmm0
+; AVX: vaddss   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %sub = fsub float %2, %1
+  %3 = insertelement <4 x float> %a, float %sub, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss   %xmm1, %xmm0
+; AVX: vsubss   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %mul = fmul float %2, %1
+  %3 = insertelement <4 x float> %a, float %mul, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss   %xmm1, %xmm0
+; AVX: vmulss   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %b, i32 0
+  %2 = extractelement <4 x float> %a, i32 0
+  %div = fdiv float %2, %1
+  %3 = insertelement <4 x float> %a, float %div, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss   %xmm1, %xmm0
+; AVX: vdivss   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %add = fadd double %2, %1
+  %3 = insertelement <2 x double> %a, double %add, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd   %xmm1, %xmm0
+; AVX: vaddsd   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %sub = fsub double %2, %1
+  %3 = insertelement <2 x double> %a, double %sub, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd   %xmm1, %xmm0
+; AVX: vsubsd   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %mul = fmul double %2, %1
+  %3 = insertelement <2 x double> %a, double %mul, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd   %xmm1, %xmm0
+; AVX: vmulsd   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %b, i32 0
+  %2 = extractelement <2 x double> %a, i32 0
+  %div = fdiv double %2, %1
+  %3 = insertelement <2 x double> %a, double %div, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd   %xmm1, %xmm0
+; AVX: vdivsd   %xmm1, %xmm0, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %add = fadd float %1, %2
+  %3 = insertelement <4 x float> %b, float %add, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss   %xmm0, %xmm1
+; AVX: vaddss   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %sub = fsub float %2, %1
+  %3 = insertelement <4 x float> %b, float %sub, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss   %xmm0, %xmm1
+; AVX: vsubss   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %mul = fmul float %1, %2
+  %3 = insertelement <4 x float> %b, float %mul, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss   %xmm0, %xmm1
+; AVX: vmulss   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = extractelement <4 x float> %b, i32 0
+  %div = fdiv float %2, %1
+  %3 = insertelement <4 x float> %b, float %div, i32 0
+  ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss   %xmm0, %xmm1
+; AVX: vdivss   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %add = fadd double %1, %2
+  %3 = insertelement <2 x double> %b, double %add, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd   %xmm0, %xmm1
+; AVX: vaddsd   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %sub = fsub double %2, %1
+  %3 = insertelement <2 x double> %b, double %sub, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd   %xmm0, %xmm1
+; AVX: vsubsd   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %mul = fmul double %1, %2
+  %3 = insertelement <2 x double> %b, double %mul, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd   %xmm0, %xmm1
+; AVX: vmulsd   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+  %1 = extractelement <2 x double> %a, i32 0
+  %2 = extractelement <2 x double> %b, i32 0
+  %div = fdiv double %2, %1
+  %3 = insertelement <2 x double> %b, double %div, i32 0
+  ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd   %xmm0, %xmm1
+; AVX: vdivsd   %xmm0, %xmm1, %xmm0
+; CHECK: ret
+
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td	(revision 196599)
+++ lib/Target/X86/X86InstrFragmentsSIMD.td	(working copy)
@@ -35,6 +35,11 @@
 def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
 def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
 
+def X86fadds   : SDNode<"X86ISD::FADDS",     SDTFPBinOp>;
+def X86fsubs   : SDNode<"X86ISD::FSUBS",     SDTFPBinOp>;
+def X86fmuls   : SDNode<"X86ISD::FMULS",     SDTFPBinOp>;
+def X86fdivs   : SDNode<"X86ISD::FDIVS",     SDTFPBinOp>;
+
 // Commutative and Associative FMIN and FMAX.
 def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
     [SDNPCommutative, SDNPAssociative]>;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp	(revision 196599)
+++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
@@ -1527,6 +1527,7 @@
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   computeRegisterProperties();
 
@@ -13906,6 +13907,10 @@
   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
   case X86ISD::XTEST:              return "X86ISD::XTEST";
+  case X86ISD::FADDS:              return "X86ISD::FADDS";
+  case X86ISD::FSUBS:              return "X86ISD::FSUBS";
+  case X86ISD::FMULS:              return "X86ISD::FMULS";
+  case X86ISD::FDIVS:              return "X86ISD::FDIVS";
   }
 }
 
@@ -19165,6 +19170,74 @@
                      In.getOperand(0));
 }
 
+// Helper function used by 'PerformINSERT_VECTOR_ELTCombine'
+// to match an EXTRACT_VECTOR_ELT (V, 0).
+static bool isExtractElementZero(const SDValue &V) {
+  return ((V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+          X86::isZeroNode(V.getOperand(1)));
+}
+
+/// PerformINSERT_VECTOR_ELTCombine - Performs INSERT_VECTOR_ELT combines.
+/// Try to synthesize SSE1/SSE2 single/double precision scalar fp arithmetic
+/// from patterns that combine scalar floating point arithmetic with
+/// insert/extract vector element operations.
+static SDValue PerformINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                               const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!(((VT == MVT::v4f32) && Subtarget->hasSSE1()) || 
+        ((VT == MVT::v2f64) && Subtarget->hasSSE2())))
+    return SDValue();
+
+  if (!X86::isZeroNode(N->getOperand(2)))
+    return SDValue();
+
+  SDValue Src = N->getOperand(1);
+  int Opcode = Src->getOpcode();
+  if ((Opcode != ISD::FADD) && (Opcode != ISD::FSUB) &&
+      (Opcode != ISD::FMUL) && (Opcode != ISD::FDIV))
+    return SDValue();
+
+  // Try to fold
+  //   (insert_vector_elt VT:$dest,
+  //     (ISD::Opcode (extract_vector_elt VT:$dest, 0),
+  //                  (extract_vector_elt VT:$src2, 0))) ->
+  //   (X86ISD::#Opcode#S (VT:$dest, VT:$src2))
+  SDValue Op0 = Src->getOperand(0);
+  SDValue Op1 = Src->getOperand(1);
+  SDValue Dest = N->getOperand(0);
+
+  if (!(isExtractElementZero(Op0) && isExtractElementZero(Op1)))
+    return SDValue();
+
+  if (Op0->getOperand(0) != Dest) {
+    if ((Opcode != ISD::FADD) && (Opcode != ISD::FMUL))
+      // Cannot commute the operands.
+      return SDValue();
+
+    if (Op1->getOperand(0) != Dest)
+      return SDValue();
+    
+    // We can still match the pattern by
+    // commuting the operands of the FADD/FMUL.
+    Op0 = Src->getOperand(1);
+    Op1 = Src->getOperand(0);
+  }
+
+  Op1 = Op1->getOperand(0);
+  if (Op1->getValueType(0) != VT)
+    return SDValue();
+
+  switch(Opcode) {
+    default: llvm_unreachable("Unexpected opcode!");
+    case ISD::FADD: Opcode = X86ISD::FADDS; break;
+    case ISD::FMUL: Opcode = X86ISD::FMULS; break;
+    case ISD::FSUB: Opcode = X86ISD::FSUBS; break;
+    case ISD::FDIV: Opcode = X86ISD::FDIVS; break;
+  }
+
+  return DAG.getNode(Opcode, SDLoc(N), VT, Dest, Op1);
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19222,6 +19295,8 @@
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INSERT_VECTOR_ELT: 
+    return PerformINSERT_VECTOR_ELTCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td	(revision 196599)
+++ lib/Target/X86/X86InstrSSE.td	(working copy)
@@ -3017,6 +3017,45 @@
              basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
 }
 
+def : Pat<(v4f32 (X86fadds (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+          (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fsubs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+          (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fmuls (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+          (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+def : Pat<(v4f32 (X86fdivs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+          (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+  
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2f64 (X86fadds (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fsubs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fmuls (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fdivs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
+let AddedComplexity = 20, Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86fadds (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86fsubs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86fmuls (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86fdivs (v4f32 VR128:$dst), (v4f32 VR128:$src))),
+            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v2f64 (X86fadds (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fsubs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fmuls (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86fdivs (v2f64 VR128:$dst), (v2f64 VR128:$src))),
+            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
 /// represent the associated intrinsic operation.  This form is unlike the
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h	(revision 196599)
+++ lib/Target/X86/X86ISelLowering.h	(working copy)
@@ -292,6 +292,11 @@
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
+      // Scalar single/double precision floating point arithmetic operations.
+      // These operators work on vector types and are not commutative. They
+      // perform both the named operation and an implicit BLEND.
+      FADDS, FSUBS, FMULS, FDIVS,
+
       BLSI,   // BLSI - Extract lowest set isolated bit
       BLSMSK, // BLSMSK - Get mask up to lowest set bit
       BLSR,   // BLSR - Reset lowest set bit

