[llvm-commits] [llvm] r44836 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp test/CodeGen/X86/vec_shuffle-12.ll test/CodeGen/X86/vec_shuffle-13.ll
Evan Cheng
evan.cheng at apple.com
Mon Dec 10 17:46:18 PST 2007
Author: evancheng
Date: Mon Dec 10 19:46:18 2007
New Revision: 44836
URL: http://llvm.org/viewvc/llvm-project?rev=44836&view=rev
Log:
- Improved v8i16 shuffle lowering. It now uses pshuflw and pshufhw as much as
possible before resorting to pextrw and pinsrw.
- Better codegen for v4i32 shuffles masquerading as v8i16 or v16i8 shuffles.
- Improves (i16 extract_vector_element 0) codegen by recognizing that
(i32 extract_vector_element 0) does not require a pextrw.
Added:
llvm/trunk/test/CodeGen/X86/vec_shuffle-13.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vec_shuffle-12.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=44836&r1=44835&r2=44836&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Dec 10 19:46:18 2007
@@ -23,6 +23,7 @@
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -35,6 +36,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ParameterAttributes.h"
using namespace llvm;
@@ -2714,7 +2716,7 @@
if (Arg.getOpcode() == ISD::UNDEF) continue;
assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
- if (Val > 4)
+ if (Val >= 4)
return false;
}
@@ -3130,6 +3132,8 @@
return V;
}
+/// is4WideVector - Returns true if the specific v8i16 or v16i8 vector is
+/// actually just a 4 wide vector. e.g. <a, a, y, y, d, d, x, x>
SDOperand
X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
// All zero's are handled with pxor, all one's are handled with pcmpeqd.
@@ -3154,7 +3158,7 @@
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
unsigned NumNonZeroImms = 0;
- std::set<SDOperand> Values;
+ SmallSet<SDOperand, 8> Values;
for (unsigned i = 0; i < NumElems; ++i) {
SDOperand Elt = Op.getOperand(i);
if (Elt.getOpcode() != ISD::UNDEF) {
@@ -3314,59 +3318,179 @@
SDOperand LowerVECTOR_SHUFFLEv8i16(SDOperand V1, SDOperand V2,
SDOperand PermMask, SelectionDAG &DAG,
TargetLowering &TLI) {
+ SDOperand NewV;
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(8);
MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
- if (isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
- // Handle v8i16 shuffle high / low shuffle node pair.
+ MVT::ValueType PtrVT = TLI.getPointerTy();
+ SmallVector<SDOperand, 8> MaskElts(PermMask.Val->op_begin(),
+ PermMask.Val->op_end());
+
+ // First record which half of which vector the low elements come from.
+ SmallVector<unsigned, 4> LowQuad(4);
+ for (unsigned i = 0; i < 4; ++i) {
+ SDOperand Elt = MaskElts[i];
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ int QuadIdx = EltIdx / 4;
+ ++LowQuad[QuadIdx];
+ }
+ int BestLowQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LowQuad[i] > MaxQuad) {
+ BestLowQuad = i;
+ MaxQuad = LowQuad[i];
+ }
+ }
+
+ // Record which half of which vector the high elements come from.
+ SmallVector<unsigned, 4> HighQuad(4);
+ for (unsigned i = 4; i < 8; ++i) {
+ SDOperand Elt = MaskElts[i];
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ int QuadIdx = EltIdx / 4;
+ ++HighQuad[QuadIdx];
+ }
+ int BestHighQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HighQuad[i] > MaxQuad) {
+ BestHighQuad = i;
+ MaxQuad = HighQuad[i];
+ }
+ }
+
+ // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
+ if (BestLowQuad != -1 || BestHighQuad != -1) {
+ // First sort the 4 chunks in order using shufpd.
SmallVector<SDOperand, 8> MaskVec;
- for (unsigned i = 0; i != 4; ++i)
- MaskVec.push_back(PermMask.getOperand(i));
- for (unsigned i = 4; i != 8; ++i)
- MaskVec.push_back(DAG.getConstant(i, MaskEVT));
- SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
- V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V2, Mask);
- MaskVec.clear();
- for (unsigned i = 0; i != 4; ++i)
- MaskVec.push_back(DAG.getConstant(i, MaskEVT));
- for (unsigned i = 4; i != 8; ++i)
- MaskVec.push_back(PermMask.getOperand(i));
- Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
- return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V2, Mask);
+ if (BestLowQuad != -1)
+ MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
+ else
+ MaskVec.push_back(DAG.getConstant(0, MVT::i32));
+ if (BestHighQuad != -1)
+ MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
+ else
+ MaskVec.push_back(DAG.getConstant(1, MVT::i32));
+ SDOperand Mask= DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec[0],2);
+ NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V1),
+ DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V2), Mask);
+ NewV = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, NewV);
+
+ // Now sort high and low parts separately.
+ BitVector InOrder(8);
+ if (BestLowQuad != -1) {
+ // Sort lower half in order using PSHUFLW.
+ MaskVec.clear();
+ bool AnyOutOrder = false;
+ for (unsigned i = 0; i != 4; ++i) {
+ SDOperand Elt = MaskElts[i];
+ if (Elt.getOpcode() == ISD::UNDEF) {
+ MaskVec.push_back(Elt);
+ InOrder.set(i);
+ } else {
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx != i)
+ AnyOutOrder = true;
+ MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));
+ // If this element is in the right place after this shuffle, then
+ // remember it.
+ if ((int)(EltIdx / 4) == BestLowQuad)
+ InOrder.set(i);
+ }
+ }
+ if (AnyOutOrder) {
+ for (unsigned i = 4; i != 8; ++i)
+ MaskVec.push_back(DAG.getConstant(i, MaskEVT));
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+ NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
+ }
+ }
+
+ if (BestHighQuad != -1) {
+ // Sort high half in order using PSHUFHW if possible.
+ MaskVec.clear();
+ for (unsigned i = 0; i != 4; ++i)
+ MaskVec.push_back(DAG.getConstant(i, MaskEVT));
+ bool AnyOutOrder = false;
+ for (unsigned i = 4; i != 8; ++i) {
+ SDOperand Elt = MaskElts[i];
+ if (Elt.getOpcode() == ISD::UNDEF) {
+ MaskVec.push_back(Elt);
+ InOrder.set(i);
+ } else {
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx != i)
+ AnyOutOrder = true;
+ MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));
+ // If this element is in the right place after this shuffle, then
+ // remember it.
+ if ((int)(EltIdx / 4) == BestHighQuad)
+ InOrder.set(i);
+ }
+ }
+ if (AnyOutOrder) {
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+ NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
+ }
+ }
+
+ // The other elements are put in the right place using pextrw and pinsrw.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (InOrder[i])
+ continue;
+ SDOperand Elt = MaskElts[i];
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx == i)
+ continue;
+ SDOperand ExtOp = (EltIdx < 8)
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
+ DAG.getConstant(EltIdx, PtrVT))
+ : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
+ DAG.getConstant(EltIdx - 8, PtrVT));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+ DAG.getConstant(i, PtrVT));
+ }
+ return NewV;
}
- // Lower than into extracts and inserts but try to do as few as possible.
+ // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use
+ ///as few as possible.
// First, let's find out how many elements are already in the right order.
unsigned V1InOrder = 0;
unsigned V1FromV1 = 0;
unsigned V2InOrder = 0;
unsigned V2FromV2 = 0;
- SmallVector<unsigned, 8> V1Elts;
- SmallVector<unsigned, 8> V2Elts;
+ SmallVector<SDOperand, 8> V1Elts;
+ SmallVector<SDOperand, 8> V2Elts;
for (unsigned i = 0; i < 8; ++i) {
- SDOperand Elt = PermMask.getOperand(i);
+ SDOperand Elt = MaskElts[i];
if (Elt.getOpcode() == ISD::UNDEF) {
- V1Elts.push_back(i);
- V2Elts.push_back(i);
+ V1Elts.push_back(Elt);
+ V2Elts.push_back(Elt);
++V1InOrder;
++V2InOrder;
+ continue;
+ }
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx == i) {
+ V1Elts.push_back(Elt);
+ V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
+ ++V1InOrder;
+ } else if (EltIdx == i+8) {
+ V1Elts.push_back(Elt);
+ V2Elts.push_back(DAG.getConstant(i, MaskEVT));
+ ++V2InOrder;
+ } else if (EltIdx < 8) {
+ V1Elts.push_back(Elt);
+ ++V1FromV1;
} else {
- unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
- if (EltIdx == i) {
- V1Elts.push_back(i);
- V2Elts.push_back(i+8);
- ++V1InOrder;
- } else if (EltIdx == i+8) {
- V1Elts.push_back(i+8);
- V2Elts.push_back(i);
- ++V2InOrder;
- } else {
- V1Elts.push_back(EltIdx);
- V2Elts.push_back(EltIdx);
- if (EltIdx < 8)
- ++V1FromV1;
- else
- ++V2FromV2;
- }
+ V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
+ ++V2FromV2;
}
}
@@ -3377,33 +3501,92 @@
std::swap(V1FromV1, V2FromV2);
}
- MVT::ValueType PtrVT = TLI.getPointerTy();
- if (V1FromV1) {
- // If there are elements that are from V1 but out of place,
- // then first sort them in place
- SmallVector<SDOperand, 8> MaskVec;
+ if ((V1FromV1 + V1InOrder) != 8) {
+ // Some elements are from V2.
+ if (V1FromV1) {
+ // If there are elements that are from V1 but out of place,
+ // then first sort them in place
+ SmallVector<SDOperand, 8> MaskVec;
+ for (unsigned i = 0; i < 8; ++i) {
+ SDOperand Elt = V1Elts[i];
+ if (Elt.getOpcode() == ISD::UNDEF) {
+ MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+ continue;
+ }
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx >= 8)
+ MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+ else
+ MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
+ }
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+ V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
+ }
+
+ NewV = V1;
for (unsigned i = 0; i < 8; ++i) {
- unsigned EltIdx = V1Elts[i];
- if (EltIdx >= 8)
- MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
- else
- MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
+ SDOperand Elt = V1Elts[i];
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (EltIdx < 8)
+ continue;
+ SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
+ DAG.getConstant(EltIdx - 8, PtrVT));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+ DAG.getConstant(i, PtrVT));
}
- SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
- V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
+ return NewV;
+ } else {
+ // All elements are from V1.
+ NewV = V1;
+ for (unsigned i = 0; i < 8; ++i) {
+ SDOperand Elt = V1Elts[i];
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
+ DAG.getConstant(EltIdx, PtrVT));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+ DAG.getConstant(i, PtrVT));
+ }
+ return NewV;
}
+}
- // Now let's insert elements from the other vector.
- for (unsigned i = 0; i < 8; ++i) {
- unsigned EltIdx = V1Elts[i];
- if (EltIdx < 8)
- continue;
- SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
- DAG.getConstant(EltIdx - 8, PtrVT));
- V1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V1, ExtOp,
- DAG.getConstant(i, PtrVT));
+/// RewriteAs4WideShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones if possible. This can be done when every pair / quad of shuffle mask
+/// elements point to elements in the right sequence. e.g.
+/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
+static
+SDOperand RewriteAs4WideShuffle(SDOperand V1, SDOperand V2,
+ SDOperand PermMask, SelectionDAG &DAG,
+ TargetLowering &TLI) {
+ unsigned NumElems = PermMask.getNumOperands();
+ unsigned Scale = NumElems / 4;
+ SmallVector<SDOperand, 4> MaskVec;
+ for (unsigned i = 0; i < NumElems; i += Scale) {
+ unsigned StartIdx = ~0U;
+ for (unsigned j = 0; j < Scale; ++j) {
+ SDOperand Elt = PermMask.getOperand(i+j);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+ if (StartIdx == ~0U)
+ StartIdx = EltIdx - (EltIdx % Scale);
+ if (EltIdx != StartIdx + j)
+ return SDOperand();
+ }
+ if (StartIdx == ~0U)
+ MaskVec.push_back(DAG.getNode(ISD::UNDEF, MVT::i32));
+ else
+ MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MVT::i32));
}
- return V1;
+
+ V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V2);
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, &MaskVec[0],4));
}
SDOperand
@@ -3544,18 +3727,31 @@
}
}
+ // If the shuffle can be rewritten as a 4 wide shuffle, then do it!
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDOperand NewOp = RewriteAs4WideShuffle(V1, V2, PermMask, DAG, *this);
+ if (NewOp.Val)
+ return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+ }
+
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
- if (VT == MVT::v8i16)
- return LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
+ if (VT == MVT::v8i16) {
+ SDOperand NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
+ if (NewOp.Val)
+ return NewOp;
+ }
- if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) {
+ // Handle all 4 wide cases with a number of shuffles.
+ if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) {
// Don't do this for MMX.
MVT::ValueType MaskVT = PermMask.getValueType();
MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
SmallVector<std::pair<int, int>, 8> Locs;
Locs.reserve(NumElems);
- SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
- SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+ SmallVector<SDOperand, 8> Mask1(NumElems,
+ DAG.getNode(ISD::UNDEF, MaskEVT));
+ SmallVector<SDOperand, 8> Mask2(NumElems,
+ DAG.getNode(ISD::UNDEF, MaskEVT));
unsigned NumHi = 0;
unsigned NumLo = 0;
// If no more than two elements come from either vector. This can be
@@ -3661,6 +3857,13 @@
MVT::ValueType VT = Op.getValueType();
// TODO: handle v16i8.
if (MVT::getSizeInBits(VT) == 16) {
+ SDOperand Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec),
+ Op.getOperand(1)));
// Transform it so it match pextrw which produces a 32-bit result.
MVT::ValueType EVT = (MVT::ValueType)(VT+1);
SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
@@ -3669,7 +3872,6 @@
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, VT, Assert);
} else if (MVT::getSizeInBits(VT) == 32) {
- SDOperand Vec = Op.getOperand(0);
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (Idx == 0)
return Op;
@@ -3686,12 +3888,12 @@
push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
&IdxVec[0], IdxVec.size());
+ SDOperand Vec = Op.getOperand(0);
Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
DAG.getConstant(0, getPointerTy()));
} else if (MVT::getSizeInBits(VT) == 64) {
- SDOperand Vec = Op.getOperand(0);
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (Idx == 0)
return Op;
@@ -3706,6 +3908,7 @@
push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
&IdxVec[0], IdxVec.size());
+ SDOperand Vec = Op.getOperand(0);
Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
Modified: llvm/trunk/test/CodeGen/X86/vec_shuffle-12.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shuffle-12.ll?rev=44836&r1=44835&r2=44836&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shuffle-12.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_shuffle-12.ll Mon Dec 10 19:46:18 2007
@@ -1,37 +1,28 @@
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2
; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep punpck
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pextrw | count 7
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 7
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuf | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pextrw | count 4
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 6
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 3
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 2
-define void @t1(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
+define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
- store <8 x i16> %tmp3, <8 x i16>* %res
- ret void
+ ret <8 x i16> %tmp3
}
-define void @t2(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7 >
- store <8 x i16> %tmp3, <8 x i16>* %res
- ret void
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
+ ret <8 x i16> %tmp
}
-define void @t3(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
- store <8 x i16> %tmp3, <8 x i16>* %res
- ret void
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
+ ret <8 x i16> %tmp
}
-define void @t4(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
- store <8 x i16> %tmp3, <8 x i16>* %res
- ret void
+define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
+ ret <8 x i16> %tmp
}
Added: llvm/trunk/test/CodeGen/X86/vec_shuffle-13.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_shuffle-13.ll?rev=44836&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_shuffle-13.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vec_shuffle-13.ll Mon Dec 10 19:46:18 2007
@@ -0,0 +1,21 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movlhps | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movss | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 1
+
+define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
+ ret <8 x i16> %tmp
+}
+
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
+ ret <8 x i16> %tmp
+}
+
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+ %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
+ ret <8 x i16> %tmp
+}
More information about the llvm-commits
mailing list