[llvm] r237689 - [mips] Correct and improve special-case shuffle instructions.

Tue May 19 05:24:52 PDT 2015

Author: dsanders
Date: Tue May 19 07:24:52 2015
New Revision: 237689

URL: http://llvm.org/viewvc/llvm-project?rev=237689&view=rev
Log:
[mips] Correct and improve special-case shuffle instructions.

Summary:
The documentation writes vectors highest-index first whereas LLVM-IR writes
them lowest-index first. As a result, instructions defined in terms of
left_half() and right_half() had the halves reversed.

In addition to correcting them, they have been improved to allow shuffles
that use the same operand twice or in reverse order. For example, ilvev
used to accept masks of the form:
  <0, n, 2, n+2, 4, n+4, ...>
but now accepts:
  <0, 0, 2, 2, 4, 4, ...>
  <n, n, n+2, n+2, n+4, n+4, ...>
  <0, n, 2, n+2, 4, n+4, ...>
  <n, 0, n+2, 2, n+4, 4, ...>

One further improvement is that splati.[bhwd] is now the preferred instruction
for splat-like operations. The other special shuffles are no longer used
for splats. This lead to the discovery that <0, 0, ...> would not cause
splati.[hwd] to be selected and this has also been fixed.

This fixes the enc-3des test from the test-suite on Mips64r6 with MSA.

Reviewers: vkalintiris

Reviewed By: vkalintiris

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D9660

Modified:
    llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.cpp
    llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.h
    llvm/trunk/lib/Target/Mips/MipsMSAInstrInfo.td
    llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
    llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.h
    llvm/trunk/lib/Target/Mips/MipsSEISelLowering.cpp
    llvm/trunk/test/CodeGen/Mips/msa/shuffle.ll

Modified: llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.cpp?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================

--- llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.cpp Tue May 19 07:24:52 2015
@@ -113,7 +113,8 @@ bool MipsDAGToDAGISel::selectAddr16(SDNo
   return false;
 }
 
-bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+                                    unsigned MinSizeInBits) const {
   llvm_unreachable("Unimplemented function.");
   return false;
 }

Modified: llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.h?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.h (original)
+++ llvm/trunk/lib/Target/Mips/MipsISelDAGToDAG.h Tue May 19 07:24:52 2015
@@ -84,7 +84,8 @@ private:
                             SDValue &Offset, SDValue &Alias);
 
   /// \brief Select constant vector splats.
-  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  virtual bool selectVSplat(SDNode *N, APInt &Imm,
+                            unsigned MinSizeInBits) const;
   /// \brief Select constant vector splats whose value fits in a uimm1.
   virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
   /// \brief Select constant vector splats whose value fits in a uimm2.

Modified: llvm/trunk/lib/Target/Mips/MipsMSAInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsMSAInstrInfo.td?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/lib/Target/Mips/MipsMSAInstrInfo.td (original)
+++ llvm/trunk/lib/Target/Mips/MipsMSAInstrInfo.td Tue May 19 07:24:52 2015
@@ -375,7 +375,7 @@ def vsplat_imm_eq_1 : PatLeaf<(build_vec
   APInt Imm;
   EVT EltTy = N->getValueType(0).getVectorElementType();
 
-  return selectVSplat (N, Imm) &&
+  return selectVSplat(N, Imm, EltTy.getSizeInBits()) &&
          Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
 }]>;
 
@@ -384,7 +384,7 @@ def vsplati64_imm_eq_1 : PatLeaf<(bitcon
   SDNode *BV = N->getOperand(0).getNode();
   EVT EltTy = N->getValueType(0).getVectorElementType();
 
-  return selectVSplat (BV, Imm) &&
+  return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
          Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
 }]>;
 

Modified: llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.cpp?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.cpp Tue May 19 07:24:52 2015
@@ -473,7 +473,8 @@ bool MipsSEDAGToDAGISel::selectIntAddrMS
 // Returns true and sets Imm if:
 // * MSA is enabled
 // * N is a ISD::BUILD_VECTOR representing a constant splat
-bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm,
+                                      unsigned MinSizeInBits) const {
   if (!Subtarget->hasMSA())
     return false;
 
@@ -486,9 +487,8 @@ bool MipsSEDAGToDAGISel::selectVSplat(SD
   unsigned SplatBitSize;
   bool HasAnyUndefs;
 
-  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                             HasAnyUndefs, 8,
-                             !Subtarget->isLittle()))
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             MinSizeInBits, !Subtarget->isLittle()))
     return false;
 
   Imm = SplatValue;
@@ -521,8 +521,9 @@ selectVSplatCommon(SDValue N, SDValue &I
   if (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0);
 
-  if (selectVSplat (N.getNode(), ImmValue) &&
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+
     if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
         (!Signed && ImmValue.isIntN(ImmBitSize))) {
       Imm = CurDAG->getTargetConstant(ImmValue, SDLoc(N), EltTy);
@@ -596,7 +597,7 @@ bool MipsSEDAGToDAGISel::selectVSplatUim
   if (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0);
 
-  if (selectVSplat (N.getNode(), ImmValue) &&
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
     int32_t Log2 = ImmValue.exactLogBase2();
 
@@ -627,7 +628,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMas
   if (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0);
 
-  if (selectVSplat(N.getNode(), ImmValue) &&
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
     // Extract the run of set bits starting with bit zero from the bitwise
     // inverse of ImmValue, and test that the inverse of this is the same
@@ -661,7 +662,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMas
   if (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0);
 
-  if (selectVSplat(N.getNode(), ImmValue) &&
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
     // Extract the run of set bits starting with bit zero, and test that the
     // result is the same as the original value
@@ -683,7 +684,7 @@ bool MipsSEDAGToDAGISel::selectVSplatUim
   if (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0);
 
-  if (selectVSplat(N.getNode(), ImmValue) &&
+  if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
       ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
     int32_t Log2 = (~ImmValue).exactLogBase2();
 

Modified: llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.h?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.h (original)
+++ llvm/trunk/lib/Target/Mips/MipsSEISelDAGToDAG.h Tue May 19 07:24:52 2015
@@ -78,7 +78,8 @@ private:
                         SDValue &Offset) const override;
 
   /// \brief Select constant vector splats.
-  bool selectVSplat(SDNode *N, APInt &Imm) const override;
+  bool selectVSplat(SDNode *N, APInt &Imm,
+                    unsigned MinSizeInBits) const override;
   /// \brief Select constant vector splats whose value fits in a given integer.
   bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
                                   unsigned ImmBitSize) const;

Modified: llvm/trunk/lib/Target/Mips/MipsSEISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Mips/MipsSEISelLowering.cpp?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/lib/Target/Mips/MipsSEISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/Mips/MipsSEISelLowering.cpp Tue May 19 07:24:52 2015
@@ -2410,7 +2410,7 @@ SDValue MipsSETargetLowering::lowerBUILD
 // It is therefore possible to lower into SHF when the mask takes the form:
 //   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
 // When undef's appear they are treated as if they were whatever value is
-// necessary in order to fit the above form.
+// necessary in order to fit the above forms.
 //
 // For example:
 //   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
@@ -2469,177 +2469,326 @@ static SDValue lowerVECTOR_SHUFFLE_SHF(S
                      DAG.getConstant(Imm, DL, MVT::i32), Op->getOperand(0));
 }
 
+/// Determine whether a range fits a regular pattern of values.
+/// This function accounts for the possibility of jumping over the End iterator.
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  auto &I = Begin;
+
+  while (I != End) {
+    if (*I != -1 && *I != ExpectedIndex)
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Incrementing past End is undefined behaviour so we must increment one
+    // step at a time and check for End at each step.
+    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
+      ; // Empty loop body.
+  }
+  return true;
+}
+
+// Determine whether VECTOR_SHUFFLE is a SPLATI.
+//
+// It is a SPLATI when the mask is:
+//   <x, x, x, ...>
+// where x is any valid index.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static bool isVECTOR_SHUFFLE_SPLATI(SDValue Op, EVT ResTy,
+                                    SmallVector<int, 16> Indices,
+                                    SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  int SplatIndex = -1;
+  for (const auto &V : Indices) {
+    if (V != -1) {
+      SplatIndex = V;
+      break;
+    }
+  }
+
+  return fitsRegularPattern<int>(Indices.begin(), 1, Indices.end(), SplatIndex,
+                                 0);
+}
+
 // Lower VECTOR_SHUFFLE into ILVEV (if possible).
 //
 // ILVEV interleaves the even elements from each vector.
 //
-// It is possible to lower into ILVEV when the mask takes the form:
-//   <0, n, 2, n+2, 4, n+4, ...>
+// It is possible to lower into ILVEV when the mask consists of two of the
+// following forms interleaved:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 2, 2, 4, 4, ...>
+//   <0, n, 2, n+2, 4, n+4, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 0;
-  int WtIdx = ResTy.getVectorNumElements();
-
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx += 2;
-    WtIdx += 2;
-  }
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into ILVOD (if possible).
 //
 // ILVOD interleaves the odd elements from each vector.
 //
-// It is possible to lower into ILVOD when the mask takes the form:
-//   <1, n+1, 3, n+3, 5, n+5, ...>
+// It is possible to lower into ILVOD when the mask consists of two of the
+// following forms interleaved:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <1, 1, 3, 3, 5, 5, ...>
+//   <1, n+1, 3, n+3, 5, n+5, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 1;
-  int WtIdx = ResTy.getVectorNumElements() + 1;
-
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx += 2;
-    WtIdx += 2;
-  }
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws);
 }
 
-// Lower VECTOR_SHUFFLE into ILVL (if possible).
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
 //
-// ILVL interleaves consecutive elements from the left half of each vector.
+// ILVR interleaves consecutive elements from the right (lowest-indexed) half of
+// each vector.
 //
-// It is possible to lower into ILVL when the mask takes the form:
-//   <0, n, 1, n+1, 2, n+2, ...>
+// It is possible to lower into ILVR when the mask consists of two of the
+// following forms interleaved:
+//   <0, 1, 2, ...>
+//   <n, n+1, n+2, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 1, 1, 2, 2, ...>
+//   <0, n, 1, n+1, 2, n+2, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
                                         SmallVector<int, 16> Indices,
                                         SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 0;
-  int WtIdx = ResTy.getVectorNumElements();
-
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx ++;
-    WtIdx ++;
-  }
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
+
+  // Check odd elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Ws, Wt);
 }
 
-// Lower VECTOR_SHUFFLE into ILVR (if possible).
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
 //
-// ILVR interleaves consecutive elements from the right half of each vector.
+// ILVL interleaves consecutive elements from the left (highest-indexed) half
+// of each vector.
 //
-// It is possible to lower into ILVR when the mask takes the form:
-//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// It is possible to lower into ILVL when the mask consists of two of the
+// following forms interleaved:
+//   <x, x+1, x+2, ...>
+//   <n+x, n+x+1, n+x+2, ...>
 // where n is the number of elements in the vector and x is half n.
+// For example:
+//   <x, x, x+1, x+1, x+2, x+2, ...>
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
                                         SmallVector<int, 16> Indices,
                                         SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  unsigned NumElts = ResTy.getVectorNumElements();
-  int WsIdx = NumElts / 2;
-  int WtIdx = NumElts + NumElts / 2;
-
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx ++;
-    WtIdx ++;
-  }
+  assert((Indices.size() % 2) == 0);
+
+  unsigned HalfSize = Indices.size() / 2;
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + HalfSize, 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  // Check odd elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + HalfSize,
+                                   1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into PCKEV (if possible).
 //
 // PCKEV copies the even elements of each vector into the result vector.
 //
-// It is possible to lower into PCKEV when the mask takes the form:
-//   <0, 2, 4, ..., n, n+2, n+4, ...>
+// It is possible to lower into PCKEV when the mask consists of two of the
+// following forms concatenated:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 2, 4, ..., 0, 2, 4, ...>
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int Idx = 0;
+  assert((Indices.size() % 2) == 0);
 
-  for (unsigned i = 0; i < Indices.size(); ++i) {
-    if (Indices[i] != -1 && Indices[i] != Idx)
-      return SDValue();
-    Idx += 2;
-  }
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into PCKOD (if possible).
 //
 // PCKOD copies the odd elements of each vector into the result vector.
 //
-// It is possible to lower into PCKOD when the mask takes the form:
-//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// It is possible to lower into PCKOD when the mask consists of two of the
+// following forms concatenated:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <1, 3, 5, ..., 1, 3, 5, ...>
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int Idx = 1;
+  assert((Indices.size() % 2) == 0);
 
-  for (unsigned i = 0; i < Indices.size(); ++i) {
-    if (Indices[i] != -1 && Indices[i] != Idx)
-      return SDValue();
-    Idx += 2;
-  }
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into VSHF.
@@ -2715,10 +2864,11 @@ SDValue MipsSETargetLowering::lowerVECTO
   for (int i = 0; i < ResTyNumElts; ++i)
     Indices.push_back(Node->getMaskElt(i));
 
-  SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
-  if (Result.getNode())
-    return Result;
-  Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
+  // splati.[bhwd] is preferable to the others but is matched from
+  // MipsISD::VSHF.
+  if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
+    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+  SDValue Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
   if (Result.getNode())
     return Result;
   Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
@@ -2736,6 +2886,9 @@ SDValue MipsSETargetLowering::lowerVECTO
   Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
   if (Result.getNode())
     return Result;
+  Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
   return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
 }
 

Modified: llvm/trunk/test/CodeGen/Mips/msa/shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Mips/msa/shuffle.ll?rev=237689&r1=237688&r2=237689&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Mips/msa/shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/Mips/msa/shuffle.ll Tue May 19 07:24:52 2015
@@ -2,7 +2,7 @@
 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
 
 define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: vshf_v16i8_0:
+  ; CHECK-LABEL: vshf_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -14,11 +14,10 @@ define void @vshf_v16i8_0(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v16i8_0
 }
 
 define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: vshf_v16i8_1:
+  ; CHECK-LABEL: vshf_v16i8_1:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -28,11 +27,10 @@ define void @vshf_v16i8_1(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v16i8_1
 }
 
 define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: vshf_v16i8_2:
+  ; CHECK-LABEL: vshf_v16i8_2:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   %2 = load <16 x i8>, <16 x i8>* %b
@@ -45,11 +43,10 @@ define void @vshf_v16i8_2(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v16i8_2
 }
 
 define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: vshf_v16i8_3:
+  ; CHECK-LABEL: vshf_v16i8_3:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -65,11 +62,10 @@ define void @vshf_v16i8_3(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v16i8_3
 }
 
 define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: vshf_v16i8_4:
+  ; CHECK-LABEL: vshf_v16i8_4:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -79,11 +75,10 @@ define void @vshf_v16i8_4(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v16i8_4
 }
 
 define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: vshf_v8i16_0:
+  ; CHECK-LABEL: vshf_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -95,11 +90,10 @@ define void @vshf_v8i16_0(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v8i16_0
 }
 
 define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: vshf_v8i16_1:
+  ; CHECK-LABEL: vshf_v8i16_1:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -109,11 +103,10 @@ define void @vshf_v8i16_1(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v8i16_1
 }
 
 define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: vshf_v8i16_2:
+  ; CHECK-LABEL: vshf_v8i16_2:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   %2 = load <8 x i16>, <8 x i16>* %b
@@ -126,11 +119,10 @@ define void @vshf_v8i16_2(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v8i16_2
 }
 
 define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: vshf_v8i16_3:
+  ; CHECK-LABEL: vshf_v8i16_3:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -146,11 +138,10 @@ define void @vshf_v8i16_3(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v8i16_3
 }
 
 define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: vshf_v8i16_4:
+  ; CHECK-LABEL: vshf_v8i16_4:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -160,14 +151,13 @@ define void @vshf_v8i16_4(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v8i16_4
 }
 
 ; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w
 ; instruction when using a single vector.
 
 define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: vshf_v4i32_0:
+  ; CHECK-LABEL: vshf_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
@@ -177,25 +167,23 @@ define void @vshf_v4i32_0(<4 x i32>* %c,
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v4i32_0
 }
 
 define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: vshf_v4i32_1:
+  ; CHECK-LABEL: vshf_v4i32_1:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+  ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][1]
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v4i32_1
 }
 
 define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: vshf_v4i32_2:
+  ; CHECK-LABEL: vshf_v4i32_2:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   %2 = load <4 x i32>, <4 x i32>* %b
@@ -206,11 +194,10 @@ define void @vshf_v4i32_2(<4 x i32>* %c,
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v4i32_2
 }
 
 define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: vshf_v4i32_3:
+  ; CHECK-LABEL: vshf_v4i32_3:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
@@ -226,25 +213,24 @@ define void @vshf_v4i32_3(<4 x i32>* %c,
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v4i32_3
 }
 
 define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: vshf_v4i32_4:
+  ; CHECK-LABEL: vshf_v4i32_4:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
-  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+  ; The two operand vectors are the same so element 1 and 5 are equivalent.
+  ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][1]
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v4i32_4
 }
 
 define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: vshf_v2i64_0:
+  ; CHECK-LABEL: vshf_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -256,11 +242,10 @@ define void @vshf_v2i64_0(<2 x i64>* %c,
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v2i64_0
 }
 
 define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: vshf_v2i64_1:
+  ; CHECK-LABEL: vshf_v2i64_1:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -270,11 +255,10 @@ define void @vshf_v2i64_1(<2 x i64>* %c,
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v2i64_1
 }
 
 define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: vshf_v2i64_2:
+  ; CHECK-LABEL: vshf_v2i64_2:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   %2 = load <2 x i64>, <2 x i64>* %b
@@ -287,11 +271,10 @@ define void @vshf_v2i64_2(<2 x i64>* %c,
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v2i64_2
 }
 
 define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: vshf_v2i64_3:
+  ; CHECK-LABEL: vshf_v2i64_3:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -307,11 +290,10 @@ define void @vshf_v2i64_3(<2 x i64>* %c,
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v2i64_3
 }
 
 define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: vshf_v2i64_4:
+  ; CHECK-LABEL: vshf_v2i64_4:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -321,11 +303,10 @@ define void @vshf_v2i64_4(<2 x i64>* %c,
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size vshf_v2i64_4
 }
 
 define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: shf_v16i8_0:
+  ; CHECK-LABEL: shf_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -335,11 +316,10 @@ define void @shf_v16i8_0(<16 x i8>* %c,
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size shf_v16i8_0
 }
 
 define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: shf_v8i16_0:
+  ; CHECK-LABEL: shf_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -349,11 +329,10 @@ define void @shf_v8i16_0(<8 x i16>* %c,
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size shf_v8i16_0
 }
 
 define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: shf_v4i32_0:
+  ; CHECK-LABEL: shf_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
@@ -363,13 +342,12 @@ define void @shf_v4i32_0(<4 x i32>* %c,
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size shf_v4i32_0
 }
 
 ; shf.d does not exist
 
 define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: ilvev_v16i8_0:
+  ; CHECK-LABEL: ilvev_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -377,64 +355,177 @@ define void @ilvev_v16i8_0(<16 x i8>* %c
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvev_v16i8_0
 }
 
 define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: ilvev_v8i16_0:
+  ; CHECK-LABEL: ilvev_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvev_v8i16_0
 }
 
 define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: ilvev_v4i32_0:
+  ; CHECK-LABEL: ilvev_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvev_v4i32_0
 }
 
 define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: ilvev_v2i64_0:
+  ; CHECK-LABEL: ilvev_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
   %2 = load <2 x i64>, <2 x i64>* %b
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
-  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+; Interleaving one operand with itself.
+define void @ilvev_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 0>
+  ; ilvev.d with two identical operands is equivalent to splati.d
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
+  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 4, i32 6, i32 6>
+  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvev_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvev_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 2, i32 2>
+  ; ilvev.d with two identical operands is equivalent to splati.d
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0]
   store <2 x i64> %3, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvev_v2i64_0
 }
 
 define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: ilvod_v16i8_0:
+  ; CHECK-LABEL: ilvod_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -447,11 +538,10 @@ define void @ilvod_v16i8_0(<16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvod_v16i8_0
 }
 
 define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: ilvod_v8i16_0:
+  ; CHECK-LABEL: ilvod_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -463,11 +553,10 @@ define void @ilvod_v8i16_0(<8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvod_v8i16_0
 }
 
 define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: ilvod_v4i32_0:
+  ; CHECK-LABEL: ilvod_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
@@ -479,11 +568,10 @@ define void @ilvod_v4i32_0(<4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvod_v4i32_0
 }
 
 define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: ilvod_v2i64_0:
+  ; CHECK-LABEL: ilvod_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -495,11 +583,126 @@ define void @ilvod_v2i64_0(<2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvod_v2i64_0
 }
 
-define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: ilvl_v16i8_0:
+define void @ilvod_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 1>
+  ; ilvod.d with two identical operands is equivalent to splati.d
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
+  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 5, i32 5, i32 7, i32 7>
+  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvod_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvod_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 3>
+  ; ilvod.d with two identical operands is equivalent to splati.d
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -507,65 +710,177 @@ define void @ilvl_v16i8_0(<16 x i8>* %c,
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvl_v16i8_0
 }
 
-define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: ilvl_v8i16_0:
+define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvl_v8i16_0
 }
 
-define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: ilvl_v4i32_0:
+define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvl_v4i32_0
 }
 
-define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: ilvl_v2i64_0:
+define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
   %2 = load <2 x i64>, <2 x i64>* %b
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
-  ; ilvl.d and ilvev.d are equivalent for v2i64
-  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; ilvr.d and ilvev.d are equivalent for v2i64
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <2 x i64> %3, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvl_v2i64_0
 }
 
-define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: ilvr_v16i8_0:
+define void @ilvr_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23>
+  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 4, i32 5, i32 5>
+  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 2, i32 2>
+  ; ilvr.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvr_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvr_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 0>
+  ; ilvr.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -573,65 +888,177 @@ define void @ilvr_v16i8_0(<16 x i8>* %c,
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvr_v16i8_0
 }
 
-define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: ilvr_v8i16_0:
+define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvr_v8i16_0
 }
 
-define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: ilvr_v4i32_0:
+define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvr_v4i32_0
 }
 
-define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: ilvr_v2i64_0:
+define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
   %2 = load <2 x i64>, <2 x i64>* %b
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
-  ; ilvr.d and ilvod.d are equivalent for v2i64
+  ; ilvl.d and ilvod.d are equivalent for v2i64
   ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
   store <2 x i64> %3, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size ilvr_v2i64_0
+}
+
+define void @ilvl_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
+  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 6, i32 6, i32 7, i32 7>
+  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 3>
+  ; ilvl.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @ilvl_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: ilvl_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 1>
+  ; ilvl.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
 }
 
 define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: pckev_v16i8_0:
+  ; CHECK-LABEL: pckev_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -639,48 +1066,45 @@ define void @pckev_v16i8_0(<16 x i8>* %c
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckev_v16i8_0
 }
 
 define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: pckev_v8i16_0:
+  ; CHECK-LABEL: pckev_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckev_v8i16_0
 }
 
 define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: pckev_v4i32_0:
+  ; CHECK-LABEL: pckev_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckev_v4i32_0
 }
 
 define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: pckev_v2i64_0:
+  ; CHECK-LABEL: pckev_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -688,16 +1112,131 @@ define void @pckev_v2i64_0(<2 x i64>* %c
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
   ; pckev.d and ilvev.d are equivalent for v2i64
-  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 8, i32 10, i32 12, i32 14>
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 6, i32 4, i32 6>
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 2, i32 2>
+  ; pckev.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 0, i32 2, i32 4, i32 6>
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 0>
+  ; pckev.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0]
   store <2 x i64> %3, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckev_v2i64_0
 }
 
 define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: pckod_v16i8_0:
+  ; CHECK-LABEL: pckod_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -705,48 +1244,45 @@ define void @pckod_v16i8_0(<16 x i8>* %c
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckod_v16i8_0
 }
 
 define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: pckod_v8i16_0:
+  ; CHECK-LABEL: pckod_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckod_v8i16_0
 }
 
 define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: pckod_v4i32_0:
+  ; CHECK-LABEL: pckod_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckod_v4i32_0
 }
 
 define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: pckod_v2i64_0:
+  ; CHECK-LABEL: pckod_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -759,11 +1295,126 @@ define void @pckod_v2i64_0(<2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size pckod_v2i64_0
+}
+
+define void @pckod_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 9, i32 11, i32 13, i32 15>
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 5, i32 7, i32 5, i32 7>
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 3>
+  ; pckod.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 1, i32 3, i32 5, i32 7>
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 1>
+  ; pckod.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
 }
 
 define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
-  ; CHECK: splati_v16i8_0:
+  ; CHECK-LABEL: splati_v16i8_0:
 
   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -774,11 +1425,10 @@ define void @splati_v16i8_0(<16 x i8>* %
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size splati_v16i8_0
 }
 
 define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
-  ; CHECK: splati_v8i16_0:
+  ; CHECK-LABEL: splati_v8i16_0:
 
   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -788,26 +1438,23 @@ define void @splati_v8i16_0(<8 x i16>* %
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size splati_v8i16_0
 }
 
 define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
-  ; CHECK: splati_v4i32_0:
+  ; CHECK-LABEL: splati_v4i32_0:
 
   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  ; shf.w and splati.w are equivalent
-  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
+  ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][3]
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size splati_v4i32_0
 }
 
 define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
-  ; CHECK: splati_v2i64_0:
+  ; CHECK-LABEL: splati_v2i64_0:
 
   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -817,5 +1464,4 @@ define void @splati_v2i64_0(<2 x i64>* %
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
   ret void
-  ; CHECK: .size splati_v2i64_0
 }