[llvm] r218143 - [x86] Fully generalize the zext lowering in the new vector shuffle

Chandler Carruth chandlerc at gmail.com
Fri Sep 19 13:00:32 PDT 2014


Author: chandlerc
Date: Fri Sep 19 15:00:32 2014
New Revision: 218143

URL: http://llvm.org/viewvc/llvm-project?rev=218143&view=rev
Log:
[x86] Fully generalize the zext lowering in the new vector shuffle
lowering to support both anyext and zext and to custom lower for many
different microarchitectures.

Using this allows us to get *exactly* the right code for zext and anyext
shuffles in all the vector sizes. For v16i8, the improvement is *huge*.
The new SSE2 test case is one I refused to add before this change because
it took so many instructions.
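
As a rough illustration (a hand-written sketch, not copied from the test
files below; the function name and exact mask are made up), the pattern this
path matches is a shufflevector whose second operand is all zeros (or whose
extended lanes are undef) and whose defined lanes walk the low elements of
the first operand at a fixed stride:

  define <16 x i8> @zext_style_shuffle(<16 x i8> %a) {
    ; Element 0 of %a, seven zero lanes, element 1 of %a, seven zero lanes:
    ; a stride-8 zero extension of the low two bytes.
    %z = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer,
         <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
                     i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    ret <16 x i8> %z
  }

With only SSE2 available, a mask like this now lowers to a pxor plus a short
punpcklbw/punpcklwd/punpckldq chain (see the updated
vector-shuffle-128-v16.ll checks below), while with SSE4.1 it can still be a
single pmovzx via X86ISD::VZEXT.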

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
    llvm/trunk/test/CodeGen/X86/widen_conversions.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=218143&r1=218142&r2=218143&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 19 15:00:32 2014
@@ -7393,16 +7393,21 @@ static SmallBitVector computeZeroableShu
   return Zeroable;
 }
 
-/// \brief Try to lower a vector shuffle as a zero extension.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
 ///
-/// This tries to use the SSE4.1 PMOVZX instruction family to lower a vector
-/// shuffle throuh a zero extension. It doesn't check for the availability or
-/// profitability of this lowering though, it tries to aggressively match this
-/// pattern. It handles both blends with all-zero inputs to explicitly
-/// zero-extend and undef-lanes (sometimes undef due to masking out later).
-static SDValue lowerVectorShuffleAsZeroExtend(SDLoc DL, MVT VT, SDValue V1,
-                                              SDValue V2, ArrayRef<int> Mask,
-                                              SelectionDAG &DAG) {
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering; it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
   int Bits = VT.getSizeInBits();
@@ -7413,6 +7418,7 @@ static SDValue lowerVectorShuffleAsZeroE
   // if valid.
   auto LowerWithStride = [&](int Stride) -> SDValue {
     SDValue InputV;
+    bool AnyExt = true;
     for (int i = 0; i < NumElements; ++i) {
       if (Mask[i] == -1)
         continue; // Valid anywhere but doesn't tell us anything.
@@ -7420,8 +7426,10 @@ static SDValue lowerVectorShuffleAsZeroE
         // Each of the extend elements needs to be zeroable.
         if (!Zeroable[i])
           return SDValue();
-        else
-          continue;
+
+        // We are no longer in the anyext case.
+        AnyExt = false;
+        continue;
       }
 
       // Each of the base elements needs to be consecutive indices into the
@@ -7442,15 +7450,68 @@ static SDValue lowerVectorShuffleAsZeroE
     if (!InputV)
       return SDValue();
 
-    // Found a valid lowering! Compute all the types and the operation. We force
-    // everything to integer types here as that's the only way zext makes sense.
-    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
-                                 NumElements / Stride);
-
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    // Found a valid zext mask! Try various lowering strategies based on the
+    // input type and available ISA extensions.
+    if (Subtarget->hasSSE41()) {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Stride),
+                                   NumElements / Stride);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    }
+
+    // For any extends we can cheat for larger element sizes and use shuffle
+    // instructions that can fold with a load and/or copy.
+    if (AnyExt && EltBits == 32) {
+      int PSHUFDMask[4] = {0, -1, 1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+    }
+    if (AnyExt && EltBits == 16 && Stride > 2) {
+      int PSHUFDMask[4] = {0, -1, 0, -1};
+      InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                           DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                           getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+      int PSHUFHWMask[4] = {1, -1, -1, -1};
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+                      getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+    }
+
+    // If this would require more than 2 unpack instructions to expand, use
+    // pshufb when available. We can only use more than 2 unpack instructions
+    // when zero extending i8 elements which also makes it easier to use pshufb.
+    if (Stride > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+      assert(NumElements == 16 && "Unexpected byte vector width!");
+      SDValue PSHUFBMask[16];
+      for (int i = 0; i < 16; ++i)
+        PSHUFBMask[i] =
+            DAG.getConstant((i % Stride == 0) ? i / Stride : 0x80, MVT::i8);
+      InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                 MVT::v16i8, PSHUFBMask)));
+    }
+
+    // Otherwise emit a sequence of unpacks.
+    do {
+      MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+      SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                           : getZeroVector(InputVT, Subtarget, DAG, DL);
+      InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+      InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+      Stride /= 2;
+      EltBits *= 2;
+      NumElements /= 2;
+    } while (Stride > 1);
+    return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
   };
 
   // The widest stride possible for zero extending is to a 64-bit integer.
@@ -7469,7 +7530,7 @@ static SDValue lowerVectorShuffleAsZeroE
       return V;
   }
 
-  // No viable zext lowering found.
+  // No viable ext lowering found.
   return SDValue();
 }
 
@@ -7843,10 +7904,9 @@ static SDValue lowerV4I32VectorShuffle(S
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt =
-            lowerVectorShuffleAsZeroExtend(DL, MVT::v4i32, V1, V2, Mask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -8519,10 +8579,9 @@ static SDValue lowerV8I16VectorShuffle(S
 
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v8i16, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   auto isV2 = [](int M) { return M >= 8; };
@@ -8688,10 +8747,9 @@ static SDValue lowerV16I8VectorShuffle(S
       return Rotate;
 
   // Try to use a zext lowering.
-  if (Subtarget->hasSSE41())
-    if (SDValue ZExt = lowerVectorShuffleAsZeroExtend(DL, MVT::v16i8, V1, V2,
-                                                      OrigMask, DAG))
-      return ZExt;
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
 
   int MaskStorage[16] = {
       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll?rev=218143&r1=218142&r2=218143&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll Fri Sep 19 15:00:32 2014
@@ -515,13 +515,13 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu
 ; SSE2-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0,0,1,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},1,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu
@@ -533,15 +533,17 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu
 }
 
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
+; SSE2-NEXT:    retq
+;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT:    pshufb {{.*}}    # [[X2]] = zero,[[X2]][2,4,6],zero,[[X2]][10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    por %xmm0, %[[X2]]
-; SSSE3-NEXT:    punpcklbw {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3],[[X2]][4],[[X1]][4],[[X2]][5],[[X1]][5],[[X2]][6],[[X1]][6],[[X2]][7],[[X1]][7]
-; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz
@@ -576,16 +578,16 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu
 define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
 ; SSE2-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3],xmm0[4],[[X]][4],xmm0[5],[[X]][5],xmm0[6],[[X]][6],xmm0[7],[[X]][7]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
 ; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
-; SSSE3-NEXT:    punpcklbw {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3],xmm0[4],[[X1]][4],xmm0[5],[[X1]][5],xmm0[6],[[X1]][6],xmm0[7],[[X1]][7]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=218143&r1=218142&r2=218143&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Fri Sep 19 15:00:32 2014
@@ -778,23 +778,20 @@ define <4 x i32> @shuffle_v4i32_0u1u(<4
 define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
 ; SSE2-LABEL: @shuffle_v4i32_0z1z
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v4i32_0z1z
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %[[X:xmm[0-9]+]], %[[X]]
-; SSSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],[[X]][1,3]
-; SSSE3-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    pxor %[[X:xmm[0-9]+]], %[[X]]
+; SSSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v4i32_0z1z

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=218143&r1=218142&r2=218143&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Fri Sep 19 15:00:32 2014
@@ -1007,23 +1007,15 @@ define <8 x i16> @shuffle_v8i16_0zzz1zzz
 ; SSE2-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSE2-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSE2-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = [[X2]][0,3,2,1]
-; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSE2-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_0zzz1zzz
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    pxor %[[X1:xmm[0-9]+]], %[[X1]]
-; SSSE3-NEXT:    pxor %[[X2:xmm[0-9]+]], %[[X2]]
-; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],xmm0[0],[[X2]][1],xmm0[1],[[X2]][2],xmm0[2],[[X2]][3],xmm0[3]
-; SSSE3-NEXT:    pshufb {{.*}} # [[X2]] = [[X2]][2,3,8,9,6,7,0,1,8,9,6,7,4,5,6,7]
-; SSSE3-NEXT:    punpcklwd {{.*}} # [[X2]] = [[X2]][0],[[X1]][0],[[X2]][1],[[X1]][1],[[X2]][2],[[X1]][2],[[X2]][3],[[X1]][3]
-; SSSE3-NEXT:    movdqa %[[X2]], %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1],xmm0[2],[[X1]][2],xmm0[3],[[X1]][3]
+; SSSE3-NEXT:    punpckldq {{.*}} # xmm0 = xmm0[0],[[X1]][0],xmm0[1],[[X1]][1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: @shuffle_v8i16_0zzz1zzz

Modified: llvm/trunk/test/CodeGen/X86/widen_conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_conversions.ll?rev=218143&r1=218142&r2=218143&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_conversions.ll Fri Sep 19 15:00:32 2014
@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4
 ; CHECK:      movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT: ret
 
   %val = load <4 x i8>* %ptr
