[llvm] r226953 - [x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector

Fri Jan 23 14:44:17 PST 2015

Author: bruno
Date: Fri Jan 23 16:44:16 2015
New Revision: 226953

URL: http://llvm.org/viewvc/llvm-project?rev=226953&view=rev
Log:
[x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector

Handle the poor codegen for i64/x86mmx->v2i64 (%mm -> %xmm) moves. Instead of
using a stack store/load pair to do the job, use scalar_to_vector directly,
which in the MMX case can use movq2dq. This was the behavior prior to the
improvements for vector legalization of extloads in r213897.

This commit fixes the regression and, as a side effect, also removes some
unnecessary shuffles.

In the new attached testcase, we go from:

pshufw  $-18, (%rdi), %mm0
movq    %mm0, -8(%rsp)
movq    -8(%rsp), %xmm0
pshufd  $-44, %xmm0, %xmm0
movd    %xmm0, %eax
...

To:

pshufw  $-18, (%rdi), %mm0
movq2dq %mm0, %xmm0
movd    %xmm0, %eax
...

Differential Revision: http://reviews.llvm.org/D7126
rdar://problem/19413324

Added:
    llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll
    llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
    llvm/trunk/test/CodeGen/X86/widen_load-2.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=226953&r1=226952&r2=226953&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jan 23 16:44:16 2015
@@ -24757,6 +24757,8 @@ static SDValue PerformLOADCombine(SDNode
   LoadSDNode *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
   EVT MemVT = Ld->getMemoryVT();
+  SDValue Ptr   = Ld->getBasePtr();
+  SDValue Chain = Ld->getChain();
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
@@ -24795,6 +24797,33 @@ static SDValue PerformLOADCombine(SDNode
     return DCI.CombineTo(N, NewVec, TF, true);
   }
 
+  // Conversion from x86mmx/i64 to v2i64 types is often done via stack
+  // store/load. Under certain conditions we can bypass the memory access and
+  // combine this load to use a scalar_to_vector instead. This reduces stack
+  // use, avoids emitting redundant shuffles and creates isel matching
+  // candidates for movq2dq instructions.
+  if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD &&
+      !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) {
+
+    // If this load is directly stored, get the original source value.
+    StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+    EVT SrcTy = PrevST->getValue().getValueType();
+    if (PrevST->getBasePtr() != Ptr ||
+        !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx))
+      return SDValue();
+    SDValue SrcVal = Chain.getOperand(1);
+
+    // On 32bit systems, we can't save 64bit integers, use f64 instead.
+    bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit();
+    if (Usef64)
+      SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal);
+    SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT,
+                              SrcVal);
+
+    return DCI.CombineTo(N, Usef64 ?
+        DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain);
+  }
+
   return SDValue();
 }
 

Modified: llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll?rev=226953&r1=226952&r2=226953&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2012-01-18-vbitcast.ll Fri Jan 23 16:44:16 2015
@@ -1,14 +1,15 @@
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s
 
-;CHECK-LABEL: vcast:
+; CHECK-LABEL: vcast:
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pmovzxdq
-;CHECK: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK: movdqa (%{{.*}}),  %[[R0:xmm[0-9]+]]
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
+; CHECK-NEXT: psubq (%{{.*}}), %[[R0]]
   %x = sub <2 x i32> %af, %bf
-;CHECK: psubq
+; CHECK: ret
   ret <2 x i32> %x
-;CHECK: ret
 }
 

Modified: llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lower-bitcast.ll?rev=226953&r1=226952&r2=226953&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/lower-bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/lower-bitcast.ll Fri Jan 23 16:44:16 2015
@@ -68,12 +68,13 @@ define i64 @test4(i64 %A) {
   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
+; FIXME: At the moment we still produce the sequence paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
-; CHECK: pshufd
+; CHECK: movd
+; CHECK-NOT: pshufd
 ; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret

Added: llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll?rev=226953&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll (added)
+++ llvm/trunk/test/CodeGen/X86/mmx-movq2dq.ll Fri Jan 23 16:44:16 2015
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
+
+; X86-32-LABEL: test0
+; X86-64-LABEL: test0
+define i32 @test0(<1 x i64>* %v4) {
+  %v5 = load <1 x i64>* %v4, align 8
+  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to x86_mmx
+  ; X86-32: pshufw  $238
+  ; X86-32-NOT: movq
+  ; X86-32-NOT: movsd
+  ; X86-32: movq2dq
+  ; X86-64: pshufw  $238
+  ; X86-64-NOT: movq
+  ; X86-64-NOT: pshufd
+  ; X86-64: movq2dq
+  ; X86-64-NEXT: movd
+  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
+  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
+  %v17 = extractelement <1 x i64> %v16, i32 0
+  %v18 = bitcast i64 %v17 to <2 x i32>
+  %v19 = extractelement <2 x i32> %v18, i32 0
+  %v20 = add i32 %v19, 32
+  ret i32 %v20
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)

Modified: llvm/trunk/test/CodeGen/X86/widen_load-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_load-2.ll?rev=226953&r1=226952&r2=226953&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll Fri Jan 23 16:44:16 2015
@@ -78,8 +78,7 @@ define void @add3i16(%i16vec3* nocapture
 ; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
 ; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
 ; CHECK-NEXT:    pshufb   {{.*}}, %[[R1]]
-; CHECK-NEXT:    pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
+; CHECK-NEXT:    movd     %[[R1]], (%{{.*}})
 	%a = load %i16vec3* %ap, align 16
 	%b = load %i16vec3* %bp, align 16
 	%x = add %i16vec3 %a, %b