[llvm-branch-commits] [llvm-branch] r243435 - Merging r243361:

Tue Jul 28 09:20:01 PDT 2015

Author: hans
Date: Tue Jul 28 11:20:00 2015
New Revision: 243435

URL: http://llvm.org/viewvc/llvm-project?rev=243435&view=rev
Log:
Merging r243361:
------------------------------------------------------------------------
r243361 | spatel | 2015-07-27 17:48:32 -0700 (Mon, 27 Jul 2015) | 17 lines

fix invalid load folding with SSE/AVX FP logical instructions (PR22371)

This is a follow-up to the FIXME that was added with D7474 ( http://reviews.llvm.org/rL229531 ).
I thought this load folding bug had been made hard-to-hit, but it turns out to be very easy
when targeting 32-bit x86 and causes a miscompile/crash in Wine:
https://bugs.winehq.org/show_bug.cgi?id=38826
https://llvm.org/bugs/show_bug.cgi?id=22371#c25

The quick fix is to simply remove the scalar FP logical instructions from the load folding table
in X86InstrInfo, but that causes us to miss load folds that should be possible when lowering fabs,
fneg, fcopysign. So the majority of this patch is altering those lowerings to use *vector* FP
logical instructions (because that's all x86 gives us anyway). That lets us do the load folding 
legally.

Differential Revision: http://reviews.llvm.org/D11477


------------------------------------------------------------------------

Modified:
    llvm/branches/release_37/   (props changed)
    llvm/branches/release_37/lib/Target/X86/X86ISelLowering.cpp
    llvm/branches/release_37/lib/Target/X86/X86InstrInfo.cpp
    llvm/branches/release_37/lib/Target/X86/X86InstrSSE.td
    llvm/branches/release_37/test/CodeGen/X86/pr2656.ll
    llvm/branches/release_37/test/CodeGen/X86/sse-fcopysign.ll
    llvm/branches/release_37/test/CodeGen/X86/vec_fabs.ll

Propchange: llvm/branches/release_37/
------------------------------------------------------------------------------

--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 28 11:20:00 2015
@@ -1,3 +1,3 @@
 /llvm/branches/Apple/Pertwee:110850,110961
 /llvm/branches/type-system-rewrite:133420-134817
-/llvm/trunk:155241,242236,242239,242281,242288,242296,242331,242341,242410,242412,242433-242434,242442,242543,242673,242680,242706,242721-242722,242733-242735,242742,242869,242919,242993,243001,243116,243263,243294
+/llvm/trunk:155241,242236,242239,242281,242288,242296,242331,242341,242410,242412,242433-242434,242442,242543,242673,242680,242706,242721-242722,242733-242735,242742,242869,242919,242993,243001,243116,243263,243294,243361

Modified: llvm/branches/release_37/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/lib/Target/X86/X86ISelLowering.cpp?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/branches/release_37/lib/Target/X86/X86ISelLowering.cpp Tue Jul 28 11:20:00 2015
@@ -12640,24 +12640,29 @@ static SDValue LowerFABSorFNEG(SDValue O
       if (User->getOpcode() == ISD::FNEG)
         return Op;
 
-  SDValue Op0 = Op.getOperand(0);
-  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  // Assume scalar op for initialization; update for vector if needed.
-  // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
-  // generate a 16-byte vector constant and logic op even for the scalar case.
-  // Using a 16-byte mask allows folding the load of the mask with
-  // the logic op, so it can save (~4 bytes) on code size.
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
+
+  MVT LogicVT;
+  MVT EltVT;
+  unsigned NumElts;
+  
   if (VT.isVector()) {
+    LogicVT = VT;
     EltVT = VT.getVectorElementType();
     NumElts = VT.getVectorNumElements();
+  } else {
+    // There are no scalar bitwise logical SSE/AVX instructions, so we
+    // generate a 16-byte vector constant and logic op even for the scalar case.
+    // Using a 16-byte mask allows folding the load of the mask with
+    // the logic op, so it can save (~4 bytes) on code size.
+    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+    EltVT = VT;
+    NumElts = (VT == MVT::f64) ? 2 : 4;
   }
 
   unsigned EltBits = EltVT.getSizeInBits();
@@ -12670,26 +12675,25 @@ static SDValue LowerFABSorFNEG(SDValue O
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, Alignment);
 
-  if (VT.isVector()) {
-    // For a vector, cast operands to a vector type, perform the logic op,
-    // and cast the result back to the original value type.
-    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
-    SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
-                              : DAG.getBitcast(VecVT, Op0);
-    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
-    return DAG.getBitcast(VT,
-                          DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
-  }
-
-  // If not vector, then scalar.
-  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+  unsigned LogicOp =
+    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
-  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
+
+  if (VT.isVector())
+    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+
+  // For the scalar case extend to a 128-bit vector, perform the logic op,
+  // and extract the scalar result back out.
+  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
+  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
+                     DAG.getIntPtrConstant(0, dl));
 }
 
 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -12729,10 +12733,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op
   Constant *C = ConstantVector::get(CV);
   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+
+  // Perform all logic operations as 16-byte vectors because there are no
+  // scalar FP logic instructions in SSE. This allows load folding of the
+  // constants into the logic instructions.
+  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+  SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
-  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+  Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
 
   // Next, clear the sign bit from the first operand (magnitude).
   // If it's a constant, we can clear it here.
@@ -12740,7 +12750,8 @@ static SDValue LowerFCOPYSIGN(SDValue Op
     APFloat APF = Op0CN->getValueAPF();
     // If the magnitude is a positive zero, the sign bit alone is enough.
     if (APF.isPosZero())
-      return SignBit;
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+                         DAG.getIntPtrConstant(0, dl));
     APF.clearSign();
     CV[0] = ConstantFP::get(*Context, APF);
   } else {
@@ -12750,15 +12761,18 @@ static SDValue LowerFCOPYSIGN(SDValue Op
   }
   C = ConstantVector::get(CV);
   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
-  SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+  SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, false, 16);
   // If the magnitude operand wasn't a constant, we need to AND out the sign.
-  if (!isa<ConstantFPSDNode>(Op0))
-    Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
-
+  if (!isa<ConstantFPSDNode>(Op0)) {
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+    Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
+  }
   // OR the magnitude value with the sign bit.
-  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+  Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+                     DAG.getIntPtrConstant(0, dl));
 }
 
 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {

Modified: llvm/branches/release_37/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/lib/Target/X86/X86InstrInfo.cpp?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/branches/release_37/lib/Target/X86/X86InstrInfo.cpp Tue Jul 28 11:20:00 2015
@@ -956,18 +956,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::DPPDrri,         X86::DPPDrmi,       TB_ALIGN_16 },
     { X86::DPPSrri,         X86::DPPSrmi,       TB_ALIGN_16 },
 
-    // FIXME: We should not be folding Fs* scalar loads into vector
-    // instructions because the vector instructions require vector-sized
-    // loads. Lowering should create vector-sized instructions (the Fv*
-    // variants below) to allow load folding.
-    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
-    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
-    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
-    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
-    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
-    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
-    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
-    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+    // Do not fold Fs* scalar logical op loads because there are no scalar
+    // load variants for these instructions. When folded, the load is required
+    // to be 128-bits, so the load size would not match.
 
     { X86::FvANDNPDrr,      X86::FvANDNPDrm,    TB_ALIGN_16 },
     { X86::FvANDNPSrr,      X86::FvANDNPSrm,    TB_ALIGN_16 },

Modified: llvm/branches/release_37/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/lib/Target/X86/X86InstrSSE.td?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/branches/release_37/lib/Target/X86/X86InstrSSE.td Tue Jul 28 11:20:00 2015
@@ -2919,6 +2919,14 @@ multiclass sse12_fp_packed_vector_logica
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
         VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>,
         PD, VEX_4V;
+
+  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+        VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>,
+        PS, VEX_4V, VEX_L;
+        
+  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+        VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>,
+        PD, VEX_4V, VEX_L;
   }
 
   let Constraints = "$src1 = $dst" in {

Modified: llvm/branches/release_37/test/CodeGen/X86/pr2656.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/test/CodeGen/X86/pr2656.ll?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/test/CodeGen/X86/pr2656.ll (original)
+++ llvm/branches/release_37/test/CodeGen/X86/pr2656.ll Tue Jul 28 11:20:00 2015
@@ -1,15 +1,24 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 ; PR2656
 
-; CHECK:     {{xorps.*sp}}
-; CHECK-NOT: {{xorps.*sp}}
-
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9.4.0"
 	%struct.anon = type <{ float, float }>
 @.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00"		; <[17 x i8]*> [#uses=1]
 
+; We can not fold either stack load into an 'xor' instruction because that
+; would change what should be a 4-byte load into a 16-byte load.
+; We can fold the 16-byte constant load into either 'xor' instruction,
+; but we do not. It has more than one use, so it gets loaded into a register.
+
 define void @foo(%struct.anon* byval %p) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:         movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT:    xorps %xmm2, %xmm0
+; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
+; CHECK-NEXT:    xorps %xmm2, %xmm1
 entry:
 	%tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0		; <float*> [#uses=1]
 	%tmp1 = load float, float* %tmp		; <float> [#uses=1]
@@ -24,3 +33,20 @@ entry:
 }
 
 declare i32 @printf(...)
+
+; We can not fold the load from the stack into the 'and' instruction because
+; that changes an 8-byte load into a 16-byte load (illegal memory access).
+; We can fold the load of the constant because it is a 16-byte vector constant.
+
+define double @PR22371(double %x) {
+; CHECK-LABEL: PR22371:
+; CHECK:       movsd  16(%esp), %xmm0
+; CHECK-NEXT:  andpd  LCPI1_0, %xmm0
+; CHECK-NEXT:  movlpd  %xmm0, (%esp)
+  %call = tail call double @fabs(double %x) #0
+  ret double %call
+}
+
+declare double @fabs(double) #0
+attributes #0 = { readnone }
+

Modified: llvm/branches/release_37/test/CodeGen/X86/sse-fcopysign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/test/CodeGen/X86/sse-fcopysign.ll?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/test/CodeGen/X86/sse-fcopysign.ll (original)
+++ llvm/branches/release_37/test/CodeGen/X86/sse-fcopysign.ll Tue Jul 28 11:20:00 2015
@@ -55,12 +55,12 @@ declare double @copysign(double, double)
 
 define float @int1(float %a, float %b) {
 ; X32-LABEL: @int1
-; X32:       movss 12(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:  movss  8(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  andps .LCPI2_0, %xmm1
-; X32-NEXT:  andps .LCPI2_1, %xmm0
-; X32-NEXT:  orps  %xmm1, %xmm0
-; X32-NEXT:  movss %xmm0, (%esp)
+; X32:       movss  8(%esp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:  andps .LCPI2_0, %xmm0
+; X32-NEXT:  movss 12(%esp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:  andps .LCPI2_1, %xmm1
+; X32-NEXT:  orps  %xmm0, %xmm1
+; X32-NEXT:  movss %xmm1, (%esp)
 ; X32-NEXT:  flds  (%esp)
 ; X32-NEXT:  popl %eax
 ; X32-NEXT:  retl
@@ -76,14 +76,14 @@ define float @int1(float %a, float %b) {
 
 define double @int2(double %a, float %b, float %c) {
 ; X32-LABEL: @int2
-; X32:       movsd  8(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:  movss 16(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:  addss 20(%ebp), %xmm1
-; X32-NEXT:  andpd .LCPI3_0, %xmm0
-; X32-NEXT:  cvtss2sd %xmm1, %xmm1
-; X32-NEXT:  andpd .LCPI3_1, %xmm1
-; X32-NEXT:  orpd  %xmm0, %xmm1
-; X32-NEXT:  movsd %xmm1, (%esp)
+; X32:       movss 16(%ebp), %xmm0 {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:  addss 20(%ebp), %xmm0
+; X32-NEXT:  movsd  8(%ebp), %xmm1 {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:  andpd .LCPI3_0, %xmm1
+; X32-NEXT:  cvtss2sd %xmm0, %xmm0
+; X32-NEXT:  andpd .LCPI3_1, %xmm0
+; X32-NEXT:  orpd  %xmm1, %xmm0
+; X32-NEXT:  movlpd %xmm0, (%esp)
 ; X32-NEXT:  fldl  (%esp)
 ; X32-NEXT:  movl %ebp, %esp
 ; X32-NEXT:  popl %ebp
@@ -91,9 +91,9 @@ define double @int2(double %a, float %b,
 ;
 ; X64-LABEL: @int2
 ; X64:       addss %xmm2, %xmm1
-; X64-NEXT:  andpd .LCPI3_0(%rip), %xmm0
 ; X64-NEXT:  cvtss2sd %xmm1, %xmm1
-; X64-NEXT:  andpd .LCPI3_1(%rip), %xmm1
+; X64-NEXT:  andpd .LCPI3_0(%rip), %xmm1
+; X64-NEXT:  andpd .LCPI3_1(%rip), %xmm0
 ; X64-NEXT:  orpd %xmm1, %xmm0
 ; X64-NEXT:  retq
   %tmp1 = fadd float %b, %c

Modified: llvm/branches/release_37/test/CodeGen/X86/vec_fabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_37/test/CodeGen/X86/vec_fabs.ll?rev=243435&r1=243434&r2=243435&view=diff
==============================================================================
--- llvm/branches/release_37/test/CodeGen/X86/vec_fabs.ll (original)
+++ llvm/branches/release_37/test/CodeGen/X86/vec_fabs.ll Tue Jul 28 11:20:00 2015
@@ -4,7 +4,7 @@
 define <2 x double> @fabs_v2f64(<2 x double> %p)
 {
   ; CHECK-LABEL: fabs_v2f64
-  ; CHECK: vandps
+  ; CHECK: vandpd
   %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
@@ -22,7 +22,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4
 define <4 x double> @fabs_v4f64(<4 x double> %p)
 {
   ; CHECK-LABEL: fabs_v4f64
-  ; CHECK: vandps
+  ; CHECK: vandpd
   %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }