[llvm] r217332 - [x86] Tweak the rules surrounding 0, 0 and 1, 1 v2f64 shuffles and add

Sun Sep 7 05:02:16 PDT 2014

Author: chandlerc
Date: Sun Sep  7 07:02:14 2014
New Revision: 217332

URL: http://llvm.org/viewvc/llvm-project?rev=217332&view=rev
Log:
[x86] Tweak the rules surrounding 0,0 and 1,1 v2f64 shuffles and add
support for MOVDDUP which is really important for matrix multiply style
operations that do lots of non-vector-aligned load and splats.

The original motivation was to add support for MOVDDUP as the lack of it
regresses matmul_f64_4x4 by 5% or so. However, all of the rules here
were somewhat suspicious.

First, we should always be using the floating point domain shuffles,
regardless of how many copies we have to make as a movapd is *crazy*
faster than the domain switching cost on some chips. (Mostly because
movapd is crazy cheap.) Because SHUFPD can't do the copy-for-free trick
of the PSHUF instructions, there is no need to avoid canonicalizing on
UNPCK variants, so do that canonicalizing. This also ensures we have the
chance to form MOVDDUP. =]

Second, we assume SSE2 support when doing any vector lowering, and given
that we should just use UNPCKLPD and UNPCKHPD as they can operate on
registers or memory. If vectors get spilled or come from memory at all
this is going to allow the load to be folded into the operation. If we
want to optimize for encoding size (the only difference, and only
a 2 byte difference) it should be done *much* later, likely after RA.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=217332&r1=217331&r2=217332&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Sep  7 07:02:14 2014
@@ -19320,26 +19320,42 @@ static bool combineX86ShuffleChain(SDVal
   // Use the float domain if the operand type is a floating point type.
   bool FloatDomain = VT.isFloatingPoint();
 
-  // If we don't have access to VEX encodings, the generic PSHUF instructions
-  // are preferable to some of the specialized forms despite requiring one more
-  // byte to encode because they can implicitly copy.
+  // For floating point shuffles, we don't have free copies in the shuffle
+  // instructions, so this always makes sense to canonicalize.
   //
-  // IF we *do* have VEX encodings, than we can use shorter, more specific
+  // For integer shuffles, if we don't have access to VEX encodings, the generic
+  // PSHUF instructions are preferable to some of the specialized forms despite
+  // requiring one more byte to encode because they can implicitly copy.
+  //
+  // IF we *do* have VEX encodings, then we can use shorter, more specific
   // shuffle instructions freely as they can copy due to the extra register
   // operand.
-  if (Subtarget->hasAVX()) {
+  if (FloatDomain || Subtarget->hasAVX()) {
     // We have both floating point and integer variants of shuffles that dup
     // either the low or high half of the vector.
     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
       bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
-                                     : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      unsigned Shuffle;
+      // If the input is a floating point, check if we have SSE3 which will let
+      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
+      // option to fold the input operand into even an unaligned memory load.
+      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+      } else {
+        // We model everything else using UNPCK instructions. While MOVLHPS and
+        // MOVHLPS are shorter encodings they cannot accept a memory operand
+        // which overly constrains subsequent lowering.
+        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      }
       if (Depth == 1 && Root->getOpcode() == Shuffle)
         return false; // Nothing to do!
-      MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
+      MVT ShuffleVT = FloatDomain ? MVT::v2f64 : MVT::v2i64;
       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
       DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
       DCI.AddToWorklist(Op.getNode());
       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                     /*AddTo*/ true);

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll?rev=217332&r1=217331&r2=217332&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll Sun Sep  7 07:02:14 2014
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x
 
 define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
@@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2
 }
 define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm0 = xmm0[1,1]
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-;        of a mov?
-;
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
@@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2
 }
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm1 = xmm1[1,1]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x double> %shuffle
@@ -217,3 +217,31 @@ define <2 x i64> @shuffle_v2i64_31_copy(
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
+
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE3:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE3-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE2:         movsd {{.*}}, %xmm0
+; CHECK-SSE2-NEXT:    unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE3:         movddup {{.*}}, %xmm0
+; CHECK-SSE3-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=217332&r1=217331&r2=217332&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Sun Sep  7 07:02:14 2014
@@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4
 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_0001
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(
 ; AVX1-LABEL: @shuffle_v4f64_1000
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(
 ; AVX1-LABEL: @shuffle_v4f64_2200
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -152,7 +152,7 @@ define <4 x double> @shuffle_v4f64_3330(
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT:    vmovhlps {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT:    vunpckhpd {{.*}} # xmm1 = xmm1[1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>