[llvm] r317111 - Adds code to PPC ISEL lowering to recognize half-word inserts from vector_shuffles, and use P9 shift and vector insert instructions instead of vperm.

Wed Nov 1 11:06:56 PDT 2017

Author: gyiu
Date: Wed Nov  1 11:06:56 2017
New Revision: 317111

URL: http://llvm.org/viewvc/llvm-project?rev=317111&view=rev
Log:
Adds code to PPC ISEL lowering to recognize half-word inserts from vector_shuffles, and use P9 shift and vector insert instructions instead of vperm.

Differential Revision: https://reviews.llvm.org/D34160

Added:
    llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
Modified:
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
    llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=317111&r1=317110&r2=317111&view=diff
==============================================================================

--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Wed Nov  1 11:06:56 2017
@@ -114,6 +114,8 @@ cl::desc("disable sibling call optimizat
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
 
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -7886,6 +7888,118 @@ static SDValue GeneratePerfectShuffle(un
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
+                                           SelectionDAG &DAG) const {
+  const unsigned NumHalfWords = 8;
+  const unsigned BytesInVector = NumHalfWords * 2;
+  // Check that the shuffle is on half-words.
+  if (!isNByteElemShuffleMask(N, 2, 1))
+    return SDValue();
+
+  bool IsLE = Subtarget.isLittleEndian();
+  SDLoc dl(N);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned ShiftElts = 0, InsertAtByte = 0;
+  bool Swap = false;
+
+  // Shifts required to get the half-word we want to element 3 (BE) / 4 (LE).
+  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
+  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
+
+  uint32_t Mask = 0;
+  uint32_t OriginalOrderLow = 0x1234567;
+  uint32_t OriginalOrderHigh = 0x89ABCDEF;
+  // Now we look at mask elements 0,2,4,6,8,10,12,14.  Pack the mask into a
+  // 32-bit value; only one 4-bit nibble is needed per element.
+  for (unsigned i = 0; i < NumHalfWords; ++i) {
+    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
+  }
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.  Possible permutations inserting an element
+  // from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7
+  //   0, X, 2, 3, 4, 5, 6, 7
+  //   0, 1, X, 3, 4, 5, 6, 7
+  //   0, 1, 2, X, 4, 5, 6, 7
+  //   0, 1, 2, 3, X, 5, 6, 7
+  //   0, 1, 2, 3, 4, X, 6, 7
+  //   0, 1, 2, 3, 4, 5, X, 7
+  //   0, 1, 2, 3, 4, 5, 6, X
+  // Inserting from V1 into V2 will be similar, except mask range will be [8,15].
+
+  bool FoundCandidate = false;
+  // Go through the mask of half-words to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < NumHalfWords; ++i) {
+    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
+    uint32_t MaskOtherElts = ~(0xF << MaskShift);
+    uint32_t TargetOrder = 0x0;
+
+    // If both vector operands for the shuffle are the same vector, the mask
+    // will contain only elements from the first one and the second one will be
+    // undef.
+    if (V2.isUndef()) {
+      ShiftElts = 0;
+      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
+      TargetOrder = OriginalOrderLow;
+      Swap = false;
+      // Skip if this is not the correct element or the mask of the other
+      // elements doesn't equal our expected order.
+      if (MaskOneElt == VINSERTHSrcElem &&
+          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+        FoundCandidate = true;
+        break;
+      }
+    } else { // If both operands are defined.
+      // Target order is [8,15] if the current mask is between [0,7].
+      TargetOrder =
+          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
+      // Skip if the other elements' mask doesn't equal our expected order.
+      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+        // We only need the last 3 bits for the number of shifts.
+        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
+                         : BigEndianShifts[MaskOneElt & 0x7];
+        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+        Swap = MaskOneElt < NumHalfWords;
+        FoundCandidate = true;
+        break;
+      }
+    }
+  }
+
+  if (!FoundCandidate)
+    return SDValue();
+
+  // Candidate found, construct the proper SDAG sequence with VINSERTH,
+  // optionally with VECSHL if shift is required.
+  if (Swap)
+    std::swap(V1, V2);
+  if (V2.isUndef())
+    V2 = V1;
+  SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  if (ShiftElts) {
+    // Double ShiftElts because we're left shifting on v16i8 type.
+    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+                              DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
+    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
+    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+  }
+  SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+  SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+                            DAG.getConstant(InsertAtByte, dl, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+}
+
 /// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
 /// is a shuffle we can handle in a single instruction, return it.  Otherwise,
 /// return the code it can be lowered into.  Worst case, it can always be
@@ -7920,6 +8034,11 @@ SDValue PPCTargetLowering::LowerVECTOR_S
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
+  if (Subtarget.hasP9Altivec()) {
+    SDValue NewISDNode = lowerToVINSERTH(SVOp, DAG);
+    if (NewISDNode)
+      return NewISDNode;
+  }
 
   if (Subtarget.hasVSX() &&
       PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h?rev=317111&r1=317110&r2=317111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h Wed Nov  1 11:06:56 2017
@@ -1072,7 +1072,14 @@ namespace llvm {
     SDValue
     combineElementTruncationToVectorTruncation(SDNode *N,
                                                DAGCombinerInfo &DCI) const;
-  };
+
+    /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the VINSERTH instruction introduced in ISA 3.0. This is
+    /// essentially any shuffle of v8i16 vectors that just inserts one element
+    /// from one vector into the other.
+    SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
+  }; // end class PPCTargetLowering
 
   namespace PPC {
 

Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td?rev=317111&r1=317110&r2=317111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td Wed Nov  1 11:06:56 2017
@@ -477,10 +477,10 @@ def VPERM      : VA1a_Int_Ty3<43, "vperm
 def VSEL       : VA1a_Int_Ty<42, "vsel",  int_ppc_altivec_vsel, v4i32>;
 
 // Shuffles.
-def VSLDOI  : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+def VSLDOI  : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH),
                        "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
-                       [(set v16i8:$vD, 
-                         (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+                       [(set v16i8:$vD,
+                         (PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>;
 
 // VX-Form instructions.  AltiVec arithmetic ops.
 let isCommutable = 1 in {
@@ -908,6 +908,9 @@ def:Pat<(vpkuwum_unary_shuffle v16i8:$vA
         (VPKUWUM $vA, $vA)>;
 def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
         (VPKUHUM $vA, $vA)>;
+def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB),
+        (VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>;
+
 
 // Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
 // These fragments are matched for little-endian, where the inputs must
@@ -1310,7 +1313,12 @@ def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "ve
 
 // Vector Insert Element Instructions
 def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
-def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
+                        (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+                        "vinserth $vD, $vB, $UIM", IIC_VecGeneral,
+                        [(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB,
+                                                      imm32SExt16:$UIM))]>,
+                        RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
 def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
 def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
 

Added: llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll?rev=317111&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll Wed Nov  1 11:06:56 2017
@@ -0,0 +1,300 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O0 -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; The following testcases take one halfword element from the second vector and
+; insert it at various locations in the first vector.
+define <8 x i16> @shuffle_vector_halfword_0_8(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_0_8
+; CHECK: vsldoi 3, 3, 3, 8
+; CHECK: vinserth 2, 3, 14
+; CHECK-BE-LABEL: shuffle_vector_halfword_0_8
+; CHECK-BE: vsldoi 3, 3, 3, 10
+; CHECK-BE: vinserth 2, 3, 0
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_1_15(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_1_15
+; CHECK: vsldoi 3, 3, 3, 10
+; CHECK: vinserth 2, 3, 12
+; CHECK-BE-LABEL: shuffle_vector_halfword_1_15
+; CHECK-BE: vsldoi 3, 3, 3, 8
+; CHECK-BE: vinserth 2, 3, 2
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 15, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_2_9(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_2_9
+; CHECK: vsldoi 3, 3, 3, 6
+; CHECK: vinserth 2, 3, 10
+; CHECK-BE-LABEL: shuffle_vector_halfword_2_9
+; CHECK-BE: vsldoi 3, 3, 3, 12
+; CHECK-BE: vinserth 2, 3, 4
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_3_13(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_3_13
+; CHECK: vsldoi 3, 3, 3, 14
+; CHECK: vinserth 2, 3, 8
+; CHECK-BE-LABEL: shuffle_vector_halfword_3_13
+; CHECK-BE: vsldoi 3, 3, 3, 4
+; CHECK-BE: vinserth 2, 3, 6
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_4_10(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_4_10
+; CHECK: vsldoi 3, 3, 3, 4
+; CHECK: vinserth 2, 3, 6
+; CHECK-BE-LABEL: shuffle_vector_halfword_4_10
+; CHECK-BE: vsldoi 3, 3, 3, 14
+; CHECK-BE: vinserth 2, 3, 8
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_5_14(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_5_14
+; CHECK: vsldoi 3, 3, 3, 12
+; CHECK: vinserth 2, 3, 4
+; CHECK-BE-LABEL: shuffle_vector_halfword_5_14
+; CHECK-BE: vsldoi 3, 3, 3, 6
+; CHECK-BE: vinserth 2, 3, 10
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 14, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_6_11(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_6_11
+; CHECK: vsldoi 3, 3, 3, 2
+; CHECK: vinserth 2, 3, 2
+; CHECK-BE-LABEL: shuffle_vector_halfword_6_11
+; CHECK-BE: vinserth 2, 3, 12
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 11, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_7_12(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_7_12
+; CHECK: vinserth 2, 3, 0
+; CHECK-BE-LABEL: shuffle_vector_halfword_7_12
+; CHECK-BE: vsldoi 3, 3, 3, 2
+; CHECK-BE: vinserth 2, 3, 14
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 12>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_8_1(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_8_1
+; CHECK: vsldoi 2, 2, 2, 6
+; CHECK: vinserth 3, 2, 14
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_8_1
+; CHECK-BE: vsldoi 2, 2, 2, 12
+; CHECK-BE: vinserth 3, 2, 0
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+; The following testcases take one halfword element from the first vector and
+; insert it at various locations in the second vector.
+define <8 x i16> @shuffle_vector_halfword_9_7(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_9_7
+; CHECK: vsldoi 2, 2, 2, 10
+; CHECK: vinserth 3, 2, 12
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_9_7
+; CHECK-BE: vsldoi 2, 2, 2, 8
+; CHECK-BE: vinserth 3, 2, 2
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_10_4(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_10_4
+; CHECK: vinserth 3, 2, 10
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_10_4
+; CHECK-BE: vsldoi 2, 2, 2, 2
+; CHECK-BE: vinserth 3, 2, 4
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 4, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_11_2(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_11_2
+; CHECK: vsldoi 2, 2, 2, 4
+; CHECK: vinserth 3, 2, 8
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_11_2
+; CHECK-BE: vsldoi 2, 2, 2, 14
+; CHECK-BE: vinserth 3, 2, 6
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 2, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_12_6(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_12_6
+; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: vinserth 3, 2, 6
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_12_6
+; CHECK-BE: vsldoi 2, 2, 2, 6
+; CHECK-BE: vinserth 3, 2, 8
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 6, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_13_3(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_13_3
+; CHECK: vsldoi 2, 2, 2, 2
+; CHECK: vinserth 3, 2, 4
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_13_3
+; CHECK-BE: vinserth 3, 2, 10
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 3, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_14_5(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_14_5
+; CHECK: vsldoi 2, 2, 2, 14
+; CHECK: vinserth 3, 2, 2
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_14_5
+; CHECK-BE: vsldoi 2, 2, 2, 4
+; CHECK-BE: vinserth 3, 2, 12
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 5, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_15_0(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_15_0
+; CHECK: vsldoi 2, 2, 2, 8
+; CHECK: vinserth 3, 2, 0
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_halfword_15_0
+; CHECK-BE: vsldoi 2, 2, 2, 10
+; CHECK-BE: vinserth 3, 2, 14
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i16> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector.  If halfword element 3 in BE mode (or 4 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction.
+define <8 x i16> @shuffle_vector_halfword_0_4(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_0_4
+; CHECK: vinserth 2, 2, 14
+; CHECK-BE-LABEL: shuffle_vector_halfword_0_4
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_1_3(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_1_3
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: shuffle_vector_halfword_1_3
+; CHECK-BE: vinserth 2, 2, 2
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 3, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_2_3(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_2_3
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: shuffle_vector_halfword_2_3
+; CHECK-BE: vinserth 2, 2, 4
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_3_4(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_3_4
+; CHECK: vinserth 2, 2, 8
+; CHECK-BE-LABEL: shuffle_vector_halfword_3_4
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 4, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_4_3(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_4_3
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: shuffle_vector_halfword_4_3
+; CHECK-BE: vinserth 2, 2, 8
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_5_3(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_5_3
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: shuffle_vector_halfword_5_3
+; CHECK-BE: vinserth 2, 2, 10
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 3, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_6_4(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_6_4
+; CHECK: vinserth 2, 2, 2
+; CHECK-BE-LABEL: shuffle_vector_halfword_6_4
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 4, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_7_4(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_halfword_7_4
+; CHECK: vinserth 2, 2, 0
+; CHECK-BE-LABEL: shuffle_vector_halfword_7_4
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>
+  ret <8 x i16> %vecins
+}
+