[llvm] r214954 - [x86] Fix two independent miscompiles in the process of getting the same

Wed Aug 6 03:16:36 PDT 2014

Author: chandlerc
Date: Wed Aug  6 05:16:36 2014
New Revision: 214954

URL: http://llvm.org/viewvc/llvm-project?rev=214954&view=rev
Log:
[x86] Fix two independent miscompiles in the process of getting the same
test case to actually generate correct code.

The primary miscompile fixed here is that we weren't correctly handling
in-place elements in one half of a single-input v8i16 shuffle when
moving a dword of elements from that half to the other half. Some times,
we would clobber the in-place elements in forming the dword to move
across halves.

The fix to this involves forcibly marking the in-place inputs even when
there is no need to gather them into a dword, and to much more carefully
re-arrange the elements when grouping them into a dword to move across
halves. With these two changes we would generate correct shuffles for
the test case, but found another miscompile. There are also some random
perturbations of the generated shuffle pattern in SSE2. It looks like
a wash; more instructions in some cases fewer in others.

The second miscompile would corrupt the results into nonsense. This is
a buggy pattern in one of the added DAG combines. Mapping elements
through a PSHUFD when pairing redundant half-shuffles is *much* harder
than this code makes it out to be -- it requires reasoning about *all*
of where the input is used in the PSHUFD, not just one part of where it
is used. Plus, we can't combine a half shuffle *into* a PSHUFD but the
code didn't guard against it. I think this was just a bad idea and I've
just removed that aspect of the combine. No tests regress as
a consequence so seems OK.

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=214954&r1=214953&r2=214954&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Aug  6 05:16:36 2014
@@ -7386,9 +7386,10 @@ static SDValue lowerV8I16SingleInputVect
   // First fix the masks for all the inputs that are staying in their
   // original halves. This will then dictate the targets of the cross-half
   // shuffles.
-  auto fixInPlaceInputs = [&PSHUFDMask](
-      ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
-      MutableArrayRef<int> HalfMask, int HalfOffset) {
+  auto fixInPlaceInputs =
+      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+                    MutableArrayRef<int> SourceHalfMask,
+                    MutableArrayRef<int> HalfMask, int HalfOffset) {
     if (InPlaceInputs.empty())
       return;
     if (InPlaceInputs.size() == 1) {
@@ -7397,6 +7398,14 @@ static SDValue lowerV8I16SingleInputVect
       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
       return;
     }
+    if (IncomingInputs.empty()) {
+      // Just fix all of the in place inputs.
+      for (int Input : InPlaceInputs) {
+        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+        PSHUFDMask[Input / 2] = Input / 2;
+      }
+      return;
+    }
 
     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
@@ -7408,10 +7417,8 @@ static SDValue lowerV8I16SingleInputVect
     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   };
-  if (!HToLInputs.empty())
-    fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
-  if (!LToHInputs.empty())
-    fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
 
   // Now gather the cross-half inputs and place them into a free dword of
   // their target half.
@@ -7420,7 +7427,8 @@ static SDValue lowerV8I16SingleInputVect
   auto moveInputsToRightHalf = [&PSHUFDMask](
       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
-      int SourceOffset, int DestOffset) {
+      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+      int DestOffset) {
     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
     };
@@ -7498,18 +7506,68 @@ static SDValue lowerV8I16SingleInputVect
     } else if (IncomingInputs.size() == 2) {
       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
-        int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
-        assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
-               "Not all dwords can be clobbered!");
-        SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
-        SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+        // We have two non-adjacent or clobbered inputs we need to extract from
+        // the source half. To do this, we need to map them into some adjacent
+        // dword slot in the source mask.
+        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+                              IncomingInputs[1] - SourceOffset};
+
+        // If there is a free slot in the source half mask adjacent to one of
+        // the inputs, place the other input in it. We use (Index XOR 1) to
+        // compute an adjacent index.
+        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+            SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+                   SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+          InputsFixed[0] = InputsFixed[1] ^ 1;
+        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
+                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+          // The two inputs are in the same DWord but it is clobbered and the
+          // adjacent DWord isn't used at all. Move both inputs to the free
+          // slot.
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+        } else {
+          // The only way we hit this point is if there is no clobbering
+          // (because there are no off-half inputs to this half) and there is no
+          // free slot adjacent to one of the inputs. In this case, we have to
+          // swap an input with a non-input.
+          for (int i = 0; i < 4; ++i)
+            assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+                   "We can't handle any clobbers here!");
+          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+                 "Cannot have adjacent inputs here!");
+
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+          // We also have to update the final source mask in this case because
+          // it may need to undo the above swap.
+          for (int &M : FinalSourceHalfMask)
+            if (M == (InputsFixed[0] ^ 1))
+              M = InputsFixed[1];
+            else if (M == InputsFixed[1])
+              M = InputsFixed[0] ^ 1;
+
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        }
+
+        // Point everything at the fixed inputs.
         for (int &M : HalfMask)
           if (M == IncomingInputs[0])
-            M = SourceDWordBase + SourceOffset;
+            M = InputsFixed[0] + SourceOffset;
           else if (M == IncomingInputs[1])
-            M = SourceDWordBase + 1 + SourceOffset;
-        IncomingInputs[0] = SourceDWordBase + SourceOffset;
-        IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+            M = InputsFixed[1] + SourceOffset;
+
+        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
       }
     } else {
       llvm_unreachable("Unhandled input size!");
@@ -7524,9 +7582,9 @@ static SDValue lowerV8I16SingleInputVect
         if (M == Input)
           M = FreeDWord * 2 + Input % 2;
   };
-  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
-  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
 
   // Now enact all the shuffles we've computed to move the inputs into their
@@ -19391,26 +19449,6 @@ static bool combineRedundantHalfShuffle(
 
       // Other-half shuffles are no-ops.
       continue;
-
-    case X86ISD::PSHUFD: {
-      // We can only handle pshufd if the half we are combining either stays in
-      // its half, or switches to the other half. Bail if one of these isn't
-      // true.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
-      if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
-            (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
-        return false;
-
-      // Map the mask through the pshufd and keep walking up the chain.
-      for (int i = 0; i < 4; ++i)
-        Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
-
-      // Switch halves if the pshufd does.
-      CombineOpcode =
-          VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
-      continue;
-    }
     }
     // Break out of the loop if we break out of the switch.
     break;

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=214954&r1=214953&r2=214954&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Wed Aug  6 05:16:36 2014
@@ -173,9 +173,9 @@ define <8 x i16> @shuffle_v8i16_26405173
 ; SSE2-LABEL: @shuffle_v8i16_26405173
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
 ; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
 ; SSE2-NEXT:    retq
 ;
@@ -190,9 +190,9 @@ define <8 x i16> @shuffle_v8i16_20645173
 ; SSE2-LABEL: @shuffle_v8i16_20645173
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
 ; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,2,3,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
 ; SSE2-NEXT:    retq
 ;
@@ -207,9 +207,9 @@ define <8 x i16> @shuffle_v8i16_26401375
 ; SSE2-LABEL: @shuffle_v8i16_26401375
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
 ; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,3,0,4,5,6,7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_26401375
@@ -404,9 +404,8 @@ define <8 x i16> @shuffle_v8i16_45630127
 ; SSE2-LABEL: @shuffle_v8i16_45630127
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,3,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_45630127
@@ -652,8 +651,9 @@ define <8 x i16> @shuffle_v8i16_012dcde3
 define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @shuffle_v8i16_XXX1X579
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,7,5,6,7]
 ; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm0[0,1,3,2,4,5,6,7]
 ; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,1,2,1]
@@ -663,36 +663,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579
 ;
 ; SSSE3-LABEL: @shuffle_v8i16_XXX1X579
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; SSSE3-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[{{[0-9]+,[0-9]+}},2,3,10,11,14,15,{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}}]
 ; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
+; SSSE3-NEXT:    pshufb {{.*}}    # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
 ; SSSE3-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
-; FIXME-SSE2-LABEL: @shuffle_v8i16_XX4X8acX
-; FIXME-SSE2:       # BB#0:
-; FIXME-SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[2,1,2,3]
-; FIXME-SSE2-NEXT:    pshuflw {{.*}}   # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; FIXME-SSE2-NEXT:    pshufd {{.*}}    # xmm1 = xmm1[0,2,2,3]
-; FIXME-SSE2-NEXT:    punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; FIXME-SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm1[0,1,0,2,4,5,6,7]
-; FIXME-SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,1,2,1]
-; FIXME-SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; FIXME-SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; FIXME-SSE2-NEXT:    retq
-;
-; FIXME-SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
-; FIXME-SSSE3:       # BB#0:
-; FIXME-SSSE3-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[2,1,2,3]
-; FIXME-SSSE3-NEXT:    pshuflw {{.*}}   # xmm1 = xmm1[0,2,2,3,5,7,6,7]
-; FIXME-SSSE3-NEXT:    pshufd {{.*}}    # xmm1 = xmm1[0,2,2,3]
-; FIXME-SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; FIXME-SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},0,1,{{[0-9]+,[0-9]+}},2,3,6,7,10,11{{[0-9]+,[0-9]+}}]
-; FIXME-SSSE3-NEXT:    retq
+; SSE2-LABEL: @shuffle_v8i16_XX4X8acX
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*}}   # xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*}}    # xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*}}   # xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*}}   # xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_XX4X8acX
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*}}    # [[X:xmm[0-9]+]] = xmm0[2,1,2,3]
+; SSSE3-NEXT:    pshuflw {{.*}}   # xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*}}    # xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],[[X]][0],xmm0[1],[[X]][1],xmm0[2],[[X]][2],xmm0[3],[[X]][3]
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+}},2,3,{{[0-9]+,[0-9]+}},0,1,4,5,8,9,{{[0-9]+,[0-9]+}}]
+; SSSE3-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
   ret <8 x i16> %shuffle
 }