[llvm-commits] [llvm] r65588 - in /llvm/branches/Apple/Dib: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h lib/Target/X86/X86InstrSSE.td test/CodeGen/X86/vec_shuffle-12.ll test/CodeGen/X86/vec_shuffle-13.ll test/CodeGen/X86/vec_shuffle-2.ll test/CodeGen/X86/vec_shuffle-21.ll test/CodeGen/X86/vec_shuffle-28.ll test/CodeGen/X86/vec_shuffle-29.ll

Thu Feb 26 15:39:22 PST 2009

Author: void
Date: Thu Feb 26 17:39:22 2009
New Revision: 65588

URL: http://llvm.org/viewvc/llvm-project?rev=65588&view=rev
Log:
Merge r65311 into Dib:

Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for further
  cleanups later.
New tests that test the above.

Examples:

New:
_shuf2:
        pextrw  $7, %xmm0, %eax
        punpcklqdq  %xmm1, %xmm0
        pshuflw     $128, %xmm0, %xmm0
        pinsrw      $2, %eax, %xmm0

Old:
_shuf2:
        pextrw  $2, %xmm0, %eax
        pextrw  $7, %xmm0, %ecx
        pinsrw  $2, %ecx, %xmm0
        pinsrw  $3, %eax, %xmm0
        movd    %xmm1, %eax
        pinsrw  $4, %eax, %xmm0
        ret

=========

New:
_shuf4:
        punpcklqdq      %xmm1, %xmm0
        pshufb          LCPI1_0, %xmm0

Old:
_shuf4:
        pextrw  $3, %xmm0, %eax
        movsd   %xmm1, %xmm0
        pextrw  $3, %xmm1, %ecx
        pinsrw  $4, %ecx, %xmm0
        pinsrw  $5, %eax, %xmm0

========

New:
_shuf1:
        pushl   %ebx
        pushl   %edi
        pushl   %esi
        pextrw  $1, %xmm0, %eax
        rolw    $8, %ax
        movd    %xmm0, %ecx
        rolw    $8, %cx
        pextrw  $5, %xmm0, %edx
        pextrw  $4, %xmm0, %esi
        pextrw  $3, %xmm0, %edi
        pextrw  $2, %xmm0, %ebx
        movaps  %xmm0, %xmm1
        pinsrw  $0, %ecx, %xmm1
        pinsrw  $1, %eax, %xmm1
        rolw    $8, %bx
        pinsrw  $2, %ebx, %xmm1
        rolw    $8, %di
        pinsrw  $3, %edi, %xmm1
        rolw    $8, %si
        pinsrw  $4, %esi, %xmm1
        rolw    $8, %dx
        pinsrw  $5, %edx, %xmm1
        pextrw  $7, %xmm0, %eax
        rolw    $8, %ax
        movaps  %xmm1, %xmm0
        pinsrw  $7, %eax, %xmm0
        popl    %esi
        popl    %edi
        popl    %ebx
        ret

Old:
_shuf1:
        subl    $252, %esp
        movaps  %xmm0, (%esp)
        movaps  %xmm0, 16(%esp)
        movaps  %xmm0, 32(%esp)
        movaps  %xmm0, 48(%esp)
        movaps  %xmm0, 64(%esp)
        movaps  %xmm0, 80(%esp)
        movaps  %xmm0, 96(%esp)
        movaps  %xmm0, 224(%esp)
        movaps  %xmm0, 208(%esp)
        movaps  %xmm0, 192(%esp)
        movaps  %xmm0, 176(%esp)
        movaps  %xmm0, 160(%esp)
        movaps  %xmm0, 144(%esp)
        movaps  %xmm0, 128(%esp)
        movaps  %xmm0, 112(%esp)
        movzbl  14(%esp), %eax
        movd    %eax, %xmm1
        movzbl  22(%esp), %eax
        movd    %eax, %xmm2
        punpcklbw     %xmm1, %xmm2
        movzbl        42(%esp), %eax
        movd          %eax, %xmm1
        movzbl        50(%esp), %eax
        movd          %eax, %xmm3
        punpcklbw     %xmm1, %xmm3
        punpcklbw     %xmm2, %xmm3
        movzbl        77(%esp), %eax
        movd          %eax, %xmm1
        movzbl        84(%esp), %eax
        movd          %eax, %xmm2
        punpcklbw     %xmm1, %xmm2
        movzbl        104(%esp), %eax
        movd          %eax, %xmm1
        punpcklbw     %xmm1, %xmm0
        punpcklbw     %xmm2, %xmm0
        movaps        %xmm0, %xmm1
        punpcklbw     %xmm3, %xmm1
        movzbl        127(%esp), %eax
        movd          %eax, %xmm0
        movzbl        135(%esp), %eax
        movd          %eax, %xmm2
        punpcklbw     %xmm0, %xmm2
        movzbl        155(%esp), %eax
        movd          %eax, %xmm0
        movzbl        163(%esp), %eax
        movd          %eax, %xmm3
        punpcklbw     %xmm0, %xmm3
        punpcklbw     %xmm2, %xmm3
        movzbl        188(%esp), %eax
        movd          %eax, %xmm0
        movzbl        197(%esp), %eax
        movd          %eax, %xmm2
        punpcklbw     %xmm0, %xmm2
        movzbl        217(%esp), %eax
        movd          %eax, %xmm4
        movzbl        225(%esp), %eax
        movd          %eax, %xmm0
        punpcklbw     %xmm4, %xmm0
        punpcklbw     %xmm2, %xmm0
        punpcklbw     %xmm3, %xmm0
        punpcklbw     %xmm1, %xmm0
        addl          $252, %esp
        ret

Modified:
    llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.cpp
    llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.h
    llvm/branches/Apple/Dib/lib/Target/X86/X86InstrSSE.td
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-12.ll
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-13.ll
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-2.ll
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-21.ll
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-28.ll
    llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-29.ll

Modified: llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.cpp?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================

--- llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.cpp Thu Feb 26 17:39:22 2009
@@ -2719,38 +2719,6 @@
   return Mask;
 }
 
-/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
-/// specifies a 8 element shuffle that can be broken into a pair of
-/// PSHUFHW and PSHUFLW.
-static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
-  assert(N->getOpcode() == ISD::BUILD_VECTOR);
-
-  if (N->getNumOperands() != 8)
-    return false;
-
-  // Lower quadword shuffled.
-  for (unsigned i = 0; i != 4; ++i) {
-    SDValue Arg = N->getOperand(i);
-    if (Arg.getOpcode() == ISD::UNDEF) continue;
-    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
-    if (Val >= 4)
-      return false;
-  }
-
-  // Upper quadword shuffled.
-  for (unsigned i = 4; i != 8; ++i) {
-    SDValue Arg = N->getOperand(i);
-    if (Arg.getOpcode() == ISD::UNDEF) continue;
-    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
-    if (Val < 4 || Val > 7)
-      return false;
-  }
-
-  return true;
-}
-
 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as
 /// values in ther permute mask.
 static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1,
@@ -3573,264 +3541,389 @@
   return SDValue();
 }
 
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
 static
 SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
                                  SDValue PermMask, SelectionDAG &DAG,
-                                 TargetLowering &TLI, DebugLoc dl) {
-  SDValue NewV;
-  MVT MaskVT = MVT::getIntVectorWithNumElements(8);
-  MVT MaskEVT = MaskVT.getVectorElementType();
-  MVT PtrVT = TLI.getPointerTy();
+                                 X86TargetLowering &TLI, DebugLoc dl) {
   SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
                                    PermMask.getNode()->op_end());
+  SmallVector<int, 8> MaskVals;
 
-  // First record which half of which vector the low elements come from.
-  SmallVector<unsigned, 4> LowQuad(4);
-  for (unsigned i = 0; i < 4; ++i) {
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
+  SmallVector<unsigned, 4> LoQuad(4);
+  SmallVector<unsigned, 4> HiQuad(4);
+  BitVector InputQuads(4);
+  for (unsigned i = 0; i < 8; ++i) {
+    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
     SDValue Elt = MaskElts[i];
-    if (Elt.getOpcode() == ISD::UNDEF)
+    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : 
+                 cast<ConstantSDNode>(Elt)->getZExtValue();
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
       continue;
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-    int QuadIdx = EltIdx / 4;
-    ++LowQuad[QuadIdx];
+    }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
   }
 
-  int BestLowQuad = -1;
+  int BestLoQuad = -1;
   unsigned MaxQuad = 1;
   for (unsigned i = 0; i < 4; ++i) {
-    if (LowQuad[i] > MaxQuad) {
-      BestLowQuad = i;
-      MaxQuad = LowQuad[i];
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
     }
   }
 
-  // Record which half of which vector the high elements come from.
-  SmallVector<unsigned, 4> HighQuad(4);
-  for (unsigned i = 4; i < 8; ++i) {
-    SDValue Elt = MaskElts[i];
-    if (Elt.getOpcode() == ISD::UNDEF)
-      continue;
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-    int QuadIdx = EltIdx / 4;
-    ++HighQuad[QuadIdx];
-  }
-
-  int BestHighQuad = -1;
+  int BestHiQuad = -1;
   MaxQuad = 1;
   for (unsigned i = 0; i < 4; ++i) {
-    if (HighQuad[i] > MaxQuad) {
-      BestHighQuad = i;
-      MaxQuad = HighQuad[i];
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
     }
   }
 
-  // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
-  if (BestLowQuad != -1 || BestHighQuad != -1) {
-    // First sort the 4 chunks in order using shufpd.
-    SmallVector<SDValue, 8> MaskVec;
-
-    if (BestLowQuad != -1)
-      MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
-    else
-      MaskVec.push_back(DAG.getConstant(0, MVT::i32));
-
-    if (BestHighQuad != -1)
-      MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
-    else
-      MaskVec.push_back(DAG.getConstant(1, MVT::i32));
+  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If there are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads.find_first();
+      BestHiQuad = InputQuads.find_next(BestLoQuad);
+    }
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
+    }
+  }
 
-    SDValue Mask= DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, &MaskVec[0],2);
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, that means that it contains
+  // words from all 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    SmallVector<SDValue,8> MaskV;
+    MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64));
+    MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 1 : BestHiQuad, MVT::i64));
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, &MaskV[0], 2);
+    
     NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
-                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
-                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
+                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
+                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
     NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
 
-    // Now sort high and low parts separately.
-    BitVector InOrder(8);
-    if (BestLowQuad != -1) {
-      // Sort lower half in order using PSHUFLW.
-      MaskVec.clear();
-      bool AnyOutOrder = false;
-
-      for (unsigned i = 0; i != 4; ++i) {
-        SDValue Elt = MaskElts[i];
-        if (Elt.getOpcode() == ISD::UNDEF) {
-          MaskVec.push_back(Elt);
-          InOrder.set(i);
-        } else {
-          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-          if (EltIdx != i)
-            AnyOutOrder = true;
-
-          MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));
-
-          // If this element is in the right place after this shuffle, then
-          // remember it.
-          if ((int)(EltIdx / 4) == BestLowQuad)
-            InOrder.set(i);
-        }
-      }
-      if (AnyOutOrder) {
-        for (unsigned i = 4; i != 8; ++i)
-          MaskVec.push_back(DAG.getConstant(i, MaskEVT));
-        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, 
-                                   &MaskVec[0], 8);
-        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, 
-                           NewV, NewV, Mask);
-      }
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
     }
 
-    if (BestHighQuad != -1) {
-      // Sort high half in order using PSHUFHW if possible.
-      MaskVec.clear();
-
-      for (unsigned i = 0; i != 4; ++i)
-        MaskVec.push_back(DAG.getConstant(i, MaskEVT));
-
-      bool AnyOutOrder = false;
-      for (unsigned i = 4; i != 8; ++i) {
-        SDValue Elt = MaskElts[i];
-        if (Elt.getOpcode() == ISD::UNDEF) {
-          MaskVec.push_back(Elt);
-          InOrder.set(i);
-        } else {
-          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-          if (EltIdx != i)
-            AnyOutOrder = true;
-
-          MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));
-
-          // If this element is in the right place after this shuffle, then
-          // remember it.
-          if ((int)(EltIdx / 4) == BestHighQuad)
-            InOrder.set(i);
-        }
-      }
-
-      if (AnyOutOrder) {
-        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, 
-                                   MaskVT, &MaskVec[0], 8);
-        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, 
-                           NewV, NewV, Mask);
-      }
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
+
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if (pshufhw || pshuflw) {
+      MaskV.clear();
+      for (unsigned i = 0; i != 8; ++i)
+        MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
+                                          : DAG.getConstant(MaskVals[i],
+                                                            MVT::i16));
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, 
+                         DAG.getUNDEF(MVT::v8i16), 
+                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16,
+                                     &MaskV[0], 8));
     }
-
-    // The other elements are put in the right place using pextrw and pinsrw.
+  }
+  
+  // If we have SSSE3, and all words of the result are from 1 input vector,
+  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
+  // is present, fall back to case 4.
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+    
+    // If we have elements from both input vectors, set the high bit of the
+    // shuffle mask element to zero out elements that come from V2 in the V1 
+    // mask, and elements that come from V1 in the V2 mask, so that the two
+    // results can be OR'd together.
+    bool TwoInputs = V1Used && V2Used;
     for (unsigned i = 0; i != 8; ++i) {
-      if (InOrder[i])
+      int EltIdx = MaskVals[i] * 2;
+      if (TwoInputs && (EltIdx >= 16)) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
         continue;
-      SDValue Elt = MaskElts[i];
-      if (Elt.getOpcode() == ISD::UNDEF)
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+    }
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    if (!TwoInputs)
+      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+    
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 8; ++i) {
+      int EltIdx = MaskVals[i] * 2;
+      if (EltIdx < 16) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
         continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-      SDValue ExtOp = (EltIdx < 8)
-        ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
-                      DAG.getConstant(EltIdx, PtrVT))
-        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
-                      DAG.getConstant(EltIdx - 8, PtrVT));
-      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
-                         DAG.getConstant(i, PtrVT));
-    }
-
-    return NewV;
-  }
-
-  // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use as
-  // few as possible. First, let's find out how many elements are already in the
-  // right order.
-  unsigned V1InOrder = 0;
-  unsigned V1FromV1 = 0;
-  unsigned V2InOrder = 0;
-  unsigned V2FromV2 = 0;
-  SmallVector<SDValue, 8> V1Elts;
-  SmallVector<SDValue, 8> V2Elts;
-  for (unsigned i = 0; i < 8; ++i) {
-    SDValue Elt = MaskElts[i];
-    if (Elt.getOpcode() == ISD::UNDEF) {
-      V1Elts.push_back(Elt);
-      V2Elts.push_back(Elt);
-      ++V1InOrder;
-      ++V2InOrder;
-      continue;
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
     }
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-    if (EltIdx == i) {
-      V1Elts.push_back(Elt);
-      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
-      ++V1InOrder;
-    } else if (EltIdx == i+8) {
-      V1Elts.push_back(Elt);
-      V2Elts.push_back(DAG.getConstant(i, MaskEVT));
-      ++V2InOrder;
-    } else if (EltIdx < 8) {
-      V1Elts.push_back(Elt);
-      V2Elts.push_back(DAG.getConstant(EltIdx+8, MaskEVT));
-      ++V1FromV1;
-    } else {
-      V1Elts.push_back(Elt);
-      V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
-      ++V2FromV2;
+    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+  }
+
+  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+  // and update MaskVals with new element order.
+  BitVector InOrder(8);
+  if (BestLoQuad >= 0) {
+    SmallVector<SDValue, 8> MaskV;
+    for (int i = 0; i != 4; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        MaskV.push_back(DAG.getUNDEF(MVT::i16));
+        InOrder.set(i);
+      } else if ((idx / 4) == BestLoQuad) {
+        MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16));
+        InOrder.set(i);
+      } else {
+        MaskV.push_back(DAG.getUNDEF(MVT::i16));
+      }
+    }
+    for (unsigned i = 4; i != 8; ++i)
+      MaskV.push_back(DAG.getConstant(i, MVT::i16));
+    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
+                       DAG.getUNDEF(MVT::v8i16),
+                       DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                   MVT::v8i16, &MaskV[0], 8));
+  }
+  
+  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
+  // and update MaskVals with the new element order.
+  if (BestHiQuad >= 0) {
+    SmallVector<SDValue, 8> MaskV;
+    for (unsigned i = 0; i != 4; ++i)
+      MaskV.push_back(DAG.getConstant(i, MVT::i16));
+    for (unsigned i = 4; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        MaskV.push_back(DAG.getUNDEF(MVT::i16));
+        InOrder.set(i);
+      } else if ((idx / 4) == BestHiQuad) {
+        MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16));
+        InOrder.set(i);
+      } else {
+        MaskV.push_back(DAG.getUNDEF(MVT::i16));
+      }
     }
+    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
+                       DAG.getUNDEF(MVT::v8i16),
+                       DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                   MVT::v8i16, &MaskV[0], 8));
   }
-
-  if (V2InOrder > V1InOrder) {
-    PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl);
-    std::swap(V1, V2);
-    std::swap(V1Elts, V2Elts);
-    std::swap(V1FromV1, V2FromV2);
+  
+  // In case BestHi & BestLo were both -1, which means each quadword has a word
+  // from each of the four input quadwords, calculate the InOrder bitvector now
+  // before falling through to the insert/extract cleanup.
+  if (BestLoQuad == -1 && BestHiQuad == -1) {
+    NewV = V1;
+    for (int i = 0; i != 8; ++i)
+      if (MaskVals[i] < 0 || MaskVals[i] == i)
+        InOrder.set(i);
   }
+  
+  // The other elements are put in the right place using pextrw and pinsrw.
+  for (unsigned i = 0; i != 8; ++i) {
+    if (InOrder[i])
+      continue;
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    SDValue ExtOp = (EltIdx < 8)
+    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx))
+    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+                  DAG.getIntPtrConstant(EltIdx - 8));
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+                       DAG.getIntPtrConstant(i));
+  }
+  return NewV;
+}
 
-  if ((V1FromV1 + V1InOrder) != 8) {
-    // Some elements are from V2.
-    if (V1FromV1) {
-      // If there are elements that are from V1 but out of place,
-      // then first sort them in place
-      SmallVector<SDValue, 8> MaskVec;
-      for (unsigned i = 0; i < 8; ++i) {
-        SDValue Elt = V1Elts[i];
-        if (Elt.getOpcode() == ISD::UNDEF) {
-          MaskVec.push_back(DAG.getUNDEF(MaskEVT));
-          continue;
-        }
-        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-        if (EltIdx >= 8)
-          MaskVec.push_back(DAG.getUNDEF(MaskEVT));
-        else
-          MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
+                                 SDValue PermMask, SelectionDAG &DAG,
+                                 X86TargetLowering &TLI, DebugLoc dl) {
+  SmallVector<SDValue, 16> MaskElts(PermMask.getNode()->op_begin(),
+                                    PermMask.getNode()->op_end());
+  SmallVector<int, 16> MaskVals;
+  
+  // If we have SSSE3, case 1 is generated when all result bytes come from
+  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
+  // present, fall back to case 3.
+  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
+  bool V1Only = true;
+  bool V2Only = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    SDValue Elt = MaskElts[i];
+    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : 
+                 cast<ConstantSDNode>(Elt)->getZExtValue();
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0)
+      continue;
+    if (EltIdx < 16)
+      V2Only = false;
+    else
+      V1Only = false;
+  }
+  
+  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+    
+    // If all result elements are from one input vector, then only translate
+    // undef mask values to 0x80 (zero out result) in the pshufb mask. 
+    //
+    // Otherwise, we have elements from both input vectors, and must zero out
+    // elements that come from V2 in the first mask, and V1 in the second mask
+    // so that we can OR them together.
+    bool TwoInputs = !(V1Only || V2Only);
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        continue;
       }
-      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], 8);
-      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, V1, V1, Mask);
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
     }
-
-    NewV = V1;
-    for (unsigned i = 0; i < 8; ++i) {
-      SDValue Elt = V1Elts[i];
-      if (Elt.getOpcode() == ISD::UNDEF)
-        continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-      if (EltIdx < 8)
+    // If all the elements are from V2, assign it to V1 and return after
+    // building the first pshufb.
+    if (V2Only)
+      V1 = V2;
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    if (!TwoInputs)
+      return V1;
+    
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 16) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
         continue;
-      SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
-                                    DAG.getConstant(EltIdx - 8, PtrVT));
-      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
-                         DAG.getConstant(i, PtrVT));
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
     }
-    return NewV;
-  } else {
-    // All elements are from V1.
-    NewV = V1;
-    for (unsigned i = 0; i < 8; ++i) {
-      SDValue Elt = V1Elts[i];
-      if (Elt.getOpcode() == ISD::UNDEF)
-        continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
-      SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
-                                    DAG.getConstant(EltIdx, PtrVT));
-      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
-                         DAG.getConstant(i, PtrVT));
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
+  
+  // No SSSE3 - Calculate in place words and then fix all out of place words
+  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+  SDValue NewV = V2Only ? V2 : V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
+    
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
+    
+    // This word of the result is already in the correct place, skip it.
+    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
+    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+      continue;
+    
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
+    
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8, TLI.getShiftAmountTy()));
+    }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits. If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8, TLI.getShiftAmountTy()));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
     }
-    return NewV;
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
   }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
 }
 
 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
@@ -4102,6 +4195,8 @@
   bool V1IsSplat = false;
   bool V2IsSplat = false;
 
+  // FIXME: Check for legal shuffle and return?
+  
   if (isUndefShuffle(Op.getNode()))
     return DAG.getUNDEF(VT);
 
@@ -4263,6 +4358,7 @@
       return Op;
   }
 
+  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
   // Try PSHUF* first, then SHUFP*.
   // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
   // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
@@ -4305,6 +4401,12 @@
       return NewOp;
   }
 
+  if (VT == MVT::v16i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(V1, V2, PermMask, DAG, *this, dl);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+  
   // Handle all 4 wide cases with a number of shuffles except for MMX.
   if (NumElems == 4 && !isMMX)
     return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl);
@@ -6857,6 +6959,7 @@
   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
@@ -6975,12 +7078,14 @@
 bool
 X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const {
   // Only do shuffles on 128-bit vector types for now.
+  // FIXME: Also accept masks that can be handled with pshufb or blends.
   if (VT.getSizeInBits() == 64) return false;
   return (Mask.getNode()->getNumOperands() <= 4 ||
           isIdentityMask(Mask.getNode()) ||
           isIdentityMask(Mask.getNode(), true) ||
           isSplatMask(Mask.getNode())  ||
-          isPSHUFHW_PSHUFLWMask(Mask.getNode()) ||
+          X86::isPSHUFHWMask(Mask.getNode()) ||
+          X86::isPSHUFLWMask(Mask.getNode()) ||
           X86::isUNPCKLMask(Mask.getNode()) ||
           X86::isUNPCKHMask(Mask.getNode()) ||
           X86::isUNPCKL_v_undef_Mask(Mask.getNode()) ||

Modified: llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.h?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/branches/Apple/Dib/lib/Target/X86/X86ISelLowering.h Thu Feb 26 17:39:22 2009
@@ -176,6 +176,9 @@
       /// corresponds to X86::PINSRW.
       PINSRW,
 
+      /// PSHUFB - Shuffle 16 8-bit values within a vector.
+      PSHUFB,
+
       /// FMAX, FMIN - Floating point max and min.
       ///
       FMAX, FMIN,

Modified: llvm/branches/Apple/Dib/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/lib/Target/X86/X86InstrSSE.td?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/branches/Apple/Dib/lib/Target/X86/X86InstrSSE.td Thu Feb 26 17:39:22 2009
@@ -36,6 +36,9 @@
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86pshufb  : SDNode<"X86ISD::PSHUFB", 
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
 def X86pextrb  : SDNode<"X86ISD::PEXTRB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
 def X86pextrw  : SDNode<"X86ISD::PEXTRW",
@@ -2841,6 +2844,11 @@
                               imm:$src3))]>, OpSize;
 }
 
+def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+          (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
+def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+          (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-12.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-12.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-12.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-12.ll Thu Feb 26 17:39:22 2009
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t
 ; RUN: not grep punpck %t
 ; RUN: grep pextrw %t | count 4
 ; RUN: grep pinsrw %t | count 6
-; RUN: grep pshuflw %t | count 3
+; RUN: grep pshuflw %t | count 1
 ; RUN: grep pshufhw %t | count 2
 
 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-13.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-13.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-13.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-13.ll Thu Feb 26 17:39:22 2009
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 > %t
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah > %t
 ; RUN: grep movlhps %t | count 1
-; RUN: grep movss %t | count 1
 ; RUN: grep pshufd %t | count 1
+; RUN: grep movss %t | count 1
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep pshufhw %t | count 1
 

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-2.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-2.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-2.ll Thu Feb 26 17:39:22 2009
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
 ; RUN: grep pshufhw %t | count 1
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep movhps  %t | count 1

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-21.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-21.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-21.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-21.ll Thu Feb 26 17:39:22 2009
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
 ; RUN: grep pshuflw %t | count 1
 ; RUN: grep pextrw %t | count 2
 ; RUN: grep pinsrw %t | count 2

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-28.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-28.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-28.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-28.ll Thu Feb 26 17:39:22 2009
@@ -1,8 +1,12 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
-; RUN: grep punpcklwd %t | count 1
-; RUN: grep pextrw %t | count 6
-; RUN: grep pinsrw %t | count 8
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
+; RUN: grep movd %t | count 1
+; RUN: grep pshuflw %t | count 1
+; RUN: grep pinsrw %t | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=core2 -o %t -f
+; RUN: grep pshufb %t | count 1
 
+; FIXME: llc currently emits a superfluous punpcklqdq before the pshufb here.
+;        Don't XFAIL the test; the output is still better than the old code.
 
 ; Pack various elements via shuffles.
 define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
@@ -10,24 +14,3 @@
 	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
 	ret <8 x i16> %tmp7
 }
-
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp8
-}
-
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}
-
-
-define <8 x i16> @shuf4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}

Modified: llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-29.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-29.ll?rev=65588&r1=65587&r2=65588&view=diff

==============================================================================
--- llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-29.ll (original)
+++ llvm/branches/Apple/Dib/test/CodeGen/X86/vec_shuffle-29.ll Thu Feb 26 17:39:22 2009
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -disable-mmx -o %t -f
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41,-ssse3 -disable-mmx -o %t -f
 ; RUN: not grep pextrw %t
 ; RUN: grep pinsrw %t