[llvm] f818ec9 - [AIX] Allow safe for 32bit P9 VSX extract and insert pattern matches

Zarko Todorovski via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 27 04:27:49 PDT 2021


Author: Zarko Todorovski
Date: 2021-04-27T07:27:43-04:00
New Revision: f818ec9dd173bfacd0ee8b403146c26e17271a46

URL: https://github.com/llvm/llvm-project/commit/f818ec9dd173bfacd0ee8b403146c26e17271a46
DIFF: https://github.com/llvm/llvm-project/commit/f818ec9dd173bfacd0ee8b403146c26e17271a46.diff

LOG: [AIX] Allow safe for 32bit P9 VSX extract and insert pattern matches

https://reviews.llvm.org/D92789 added PPC64 checks that disabled most VSX
pattern matching on 32-bit targets.  This patch re-enables the extract and
insert patterns that are safe for 32-bit.

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D97503

Added: 
    llvm/test/CodeGen/PowerPC/aix-insert-extract.ll
    llvm/test/CodeGen/PowerPC/aix-p9-insert-extract.ll
    llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
    llvm/test/CodeGen/PowerPC/aix-vec_extract_p9.ll
    llvm/test/CodeGen/PowerPC/aix-vec_extract_p9_2.ll
    llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCInstrVSX.td
    llvm/test/CodeGen/PowerPC/vec-bswap.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 37e156808f6e4..4ccabb05b7796 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -408,7 +408,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // to speed up scalar BSWAP64.
   // CTPOP or CTTZ were introduced in P8/P9 respectively
   setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
-  if (Subtarget.hasP9Vector())
+  if (Subtarget.hasP9Vector() && Subtarget.isPPC64())
     setOperationAction(ISD::BSWAP, MVT::i64  , Custom);
   else
     setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
@@ -10254,6 +10254,8 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 // Lower scalar BSWAP64 to xxbrd.
 SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
+  if (!Subtarget.isPPC64())
+    return Op;
   // MTVSRDD
   Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
                    Op.getOperand(0));

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 16f5bac501033..9b87efdf211dd 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -4067,10 +4067,7 @@ defm : ScalToVecWPermute<
   v8i16, ScalarLoads.Li16,
   (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
   (SUBREG_TO_REG (i64 1), (LXSIHZX xoaddr:$src), sub_64)>;
-} // HasVSX, HasP9Vector, NoP10Vector
 
-// Big endian 64Bit Power9 subtarget.
-let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
 def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
           (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
 def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
@@ -4155,7 +4152,10 @@ def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
           (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
 def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
           (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
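+} // HasVSX, HasP9Vector, IsBigEndian -- NOTE(review): this diff opens no IsBigEndian-only 'let' (it removes the NoP10Vector close instead); verify which block this brace ends.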
 
+// Big endian 64Bit Power9 subtarget.
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
 def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
           (v2i64 (SUBREG_TO_REG (i64 1), (DFLOADf64 iaddrX4:$src), sub_64))>;
 def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),

diff  --git a/llvm/test/CodeGen/PowerPC/aix-insert-extract.ll b/llvm/test/CodeGen/PowerPC/aix-insert-extract.ll
new file mode 100644
index 0000000000000..c3ee88cc31a0b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-insert-extract.ll
@@ -0,0 +1,808 @@
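+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc-ibm-aix-xcoff -vec-extabi \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-32
+; NOTE(review): only the CHECK-64 and CHECK-32 prefixes are enabled above, so directives using the plain CHECK prefix below are not verified by either run.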
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define float @_Z13testUiToFpExtILj0EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: _Z13testUiToFpExtILj0EEfDv4_j
+; CHECK-64: xxextractuw 0, 34, 0
+; CHECK-64: xscvuxdsp 1, 0
+; CHECK-32-LABEL: _Z13testUiToFpExtILj0EEfDv4_j
+; CHECK-32: lfiwzx 0, 0, 3
+; CHECK-32: xscvuxdsp 1, 0
+  %vecext = extractelement <4 x i32> %a, i32 0
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj1EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: _Z13testUiToFpExtILj1EEfDv4_j
+; CHECK-64: xxextractuw 0, 34, 4
+; CHECK-64: xscvuxdsp 1, 0
+; CHECK-32-LABEL: _Z13testUiToFpExtILj1EEfDv4_j
+; CHECK-32: lfiwzx 0, 0, 3
+; CHECK-32: xscvuxdsp 1, 0
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj2EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: _Z13testUiToFpExtILj2EEfDv4_j
+; CHECK-64: xxextractuw 0, 34, 8
+; CHECK-64: xscvuxdsp 1, 0
+; CHECK-32-LABEL: _Z13testUiToFpExtILj2EEfDv4_j
+; CHECK-32: lfiwzx 0, 0, 3
+; CHECK-32: xscvuxdsp 1, 0
+  %vecext = extractelement <4 x i32> %a, i32 2
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj3EEfDv4_j(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: _Z13testUiToFpExtILj3EEfDv4_j
+; CHECK-64: xxextractuw 0, 34, 12
+; CHECK-64: xscvuxdsp 1, 0
+; CHECK-32-LABEL: _Z13testUiToFpExtILj3EEfDv4_j
+; CHECK-32: lfiwzx 0, 0, 3
+; CHECK-32: xscvuxdsp 1, 0
+  %vecext = extractelement <4 x i32> %a, i32 3
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+; Verify we generate optimal code for unsigned vector int elem extract followed
+; by conversion to double
+
+define double @conv2dlbTestui0(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: conv2dlbTestui0
+; CHECK-64: xxextractuw [[CP64:[0-9]+]], 34, 0
+; CHECK-64: xscvuxddp 1, [[CP64]]
+; CHECK-32-LABEL: conv2dlbTestui0
+; CHECK-32: lfiwzx [[CP32:[0-9]+]], 0, 3
+; CHECK-32: xscvuxddp 1, [[CP32]]
+  %0 = extractelement <4 x i32> %a, i32 0
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui1(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: conv2dlbTestui1
+; CHECK-64: xxextractuw [[CP64:[0-9]+]], 34, 4
+; CHECK-64: xscvuxddp 1, [[CP64]]
+; CHECK-32-LABEL: conv2dlbTestui1
+; CHECK-32: lfiwzx [[CP32:[0-9]+]], 0, 3
+; CHECK-32: xscvuxddp 1, [[CP32]]
+  %0 = extractelement <4 x i32> %a, i32 1
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui2(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: conv2dlbTestui2
+; CHECK-64: xxextractuw [[CP64:[0-9]+]], 34, 8
+; CHECK-64: xscvuxddp 1, [[CP64]]
+; CHECK-32-LABEL: conv2dlbTestui2
+; CHECK-32: lfiwzx [[CP32:[0-9]+]], 0, 3
+; CHECK-32: xscvuxddp 1, [[CP32]]
+  %0 = extractelement <4 x i32> %a, i32 2
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui3(<4 x i32> %a) {
+entry:
+; CHECK-64-LABEL: conv2dlbTestui3
+; CHECK-64: xxextractuw [[CP64:[0-9]+]], 34, 12
+; CHECK-64: xscvuxddp 1, [[CP64]]
+; CHECK-32-LABEL: conv2dlbTestui3
+; CHECK-32: lfiwzx [[CP32:[0-9]+]], 0, 3
+; CHECK-32: xscvuxddp 1, [[CP32]]
+  %0 = extractelement <4 x i32> %a, i32 3
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+; verify we don't crash for variable elem extract
+define double @conv2dlbTestuiVar(<4 x i32> %a, i32 zeroext %elem) {
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %elem
+  %conv = uitofp i32 %vecext to double
+  ret double %conv
+}
+
+define <4 x float> @_Z10testInsEltILj0EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj0EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = insertelement <4 x float> %a, float %b, i32 0
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj1EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj1EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = insertelement <4 x float> %a, float %b, i32 1
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj2EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj2EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = insertelement <4 x float> %a, float %b, i32 2
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj3EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj3EDv4_ffET0_S1_T1_
+; CHECK: xscvdpspn 0, 1
+; CHECK: xxsldwi 0, 0, 0, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = insertelement <4 x float> %a, float %b, i32 3
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj0EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj0EDv4_jjET0_S1_T1_
+; CHECK: mtfprwz 0, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 0
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj1EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj1EDv4_jjET0_S1_T1_
+; CHECK: mtfprwz 0, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 1
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj2EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj2EDv4_jjET0_S1_T1_
+; CHECK: mtfprwz 0, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 2
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj3EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+entry:
+; CHECK-LABEL: _Z10testInsEltILj3EDv4_jjET0_S1_T1_
+; CHECK: mtfprwz 0, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 3
+  ret <4 x i32> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 0
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 0
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 4
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 4
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 8
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 8
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 3
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_
+; CHECK-NOT: xxsldwi
+; CHECK: xxinsertw 34, 35, 12
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_
+; CHECK: xxsldwi 0, 35, 35, 1
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+entry:
+; CHECK-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
+; CHECK: xxswapd 0, 35
+; CHECK: xxinsertw 34, 0, 12
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x i32> %vecins
+}
+define <4 x float> @testSameVecEl0BE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl0BE
+; CHECK: xxinsertw 34, 34, 0
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl2BE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl2BE
+; CHECK: xxinsertw 34, 34, 8
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl3BE(<4 x float> %a) {
+entry:
+; CHECK-LABEL: testSameVecEl3BE
+; CHECK: xxinsertw 34, 34, 12
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x float> %vecins
+}
+define <4 x float> @insertVarF(<4 x float> %a, float %f, i32 %el) {
+entry:
+; CHECK-LABEL: insertVarF
+; CHECK: stfsx 1,
+; CHECK: lxv
+  %vecins = insertelement <4 x float> %a, float %f, i32 %el
+  ret <4 x float> %vecins
+}
+define <4 x i32> @insertVarI(<4 x i32> %a, i32 %i, i32 %el) {
+entry:
+; CHECK-LABEL: insertVarI
+; CHECK: stwx
+; CHECK: lxv
+  %vecins = insertelement <4 x i32> %a, i32 %i, i32 %el
+  ret <4 x i32> %vecins
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-p9-insert-extract.ll b/llvm/test/CodeGen/PowerPC/aix-p9-insert-extract.ll
new file mode 100644
index 0000000000000..1890549c5ec71
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-p9-insert-extract.ll
@@ -0,0 +1,2893 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK-64,CHECK-64-OPT %s
+; RUN: llc -O0 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK-64,CHECK-64-O0 %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-OPT
+; RUN: llc -O0 -mcpu=pwr9 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-32,CHECK-32-O0
+
+; The following testcases take one halfword element from the second vector and
+; insert it at various locations in the first vector.
+define <8 x i16> @shuffle_vector_halfword_0_8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_0_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-64-NEXT:    vinserth 2, 3, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_0_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-32-NEXT:    vinserth 2, 3, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_1_15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_1_15:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-64-NEXT:    vinserth 2, 3, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_1_15:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-32-NEXT:    vinserth 2, 3, 2
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 15, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_2_9(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_2_9:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-64-NEXT:    vinserth 2, 3, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_2_9:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-32-NEXT:    vinserth 2, 3, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_3_13(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_3_13:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-64-NEXT:    vinserth 2, 3, 6
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_3_13:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-32-NEXT:    vinserth 2, 3, 6
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_4_10(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_4_10:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-64-NEXT:    vinserth 2, 3, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_4_10:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-32-NEXT:    vinserth 2, 3, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_5_14(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_5_14:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-64-NEXT:    vinserth 2, 3, 10
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_5_14:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-32-NEXT:    vinserth 2, 3, 10
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 14, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_6_11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_6_11:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinserth 2, 3, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_6_11:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinserth 2, 3, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 11, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_7_12(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-LABEL: shuffle_vector_halfword_7_12:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-64-NEXT:    vinserth 2, 3, 14
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_7_12:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-32-NEXT:    vinserth 2, 3, 14
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 12>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_8_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_8_1:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 12
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 0
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_8_1:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 0
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_8_1:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 12
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 0
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_8_1:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 0
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+; The following testcases take one halfword element from the first vector and
+; insert it at various locations in the second vector.
+define <8 x i16> @shuffle_vector_halfword_9_7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_9_7:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 8
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 2
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_9_7:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 2
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_9_7:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 8
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 2
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_9_7:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 2
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_10_4(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_10_4:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 2
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 4
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_10_4:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 4
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_10_4:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 2
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 4
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_10_4:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 4
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 4, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_11_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_11_2:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 14
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 6
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_11_2:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 6
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_11_2:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 14
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 6
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_11_2:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 6
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 2, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_12_6(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_12_6:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 6
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 8
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_12_6:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 8
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_12_6:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 6
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 8
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_12_6:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 8
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 6, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_13_3(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_13_3:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 10
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_13_3:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 10
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_13_3:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 10
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_13_3:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 10
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 3, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_14_5(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_14_5:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 4
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 12
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_14_5:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 12
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_14_5:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 4
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 12
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_14_5:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 12
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 5, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_15_0(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_halfword_15_0:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 10
+; CHECK-64-OPT-NEXT:    vinserth 3, 2, 14
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_halfword_15_0:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 14
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_halfword_15_0:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 10
+; CHECK-32-OPT-NEXT:    vinserth 3, 2, 14
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_halfword_15_0:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 14
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i16> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector.  If halfword element 3 in BE mode (or 4 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction.
+define <8 x i16> @shuffle_vector_halfword_0_4(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinserth 2, 2, 14
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_0_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI16_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI16_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_0_4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C0(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_0_4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C0(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_1_3(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI17_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI17_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_1_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinserth 2, 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_1_3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinserth 2, 2, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_1_3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinserth 2, 2, 2
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 3, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_2_3(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI18_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI18_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_2_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinserth 2, 2, 4
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_2_3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinserth 2, 2, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_2_3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinserth 2, 2, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_3_4(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinserth 2, 2, 8
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_3_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI19_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI19_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_3_4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C1(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_3_4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C1(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 4, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_4_3(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI20_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI20_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_4_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinserth 2, 2, 8
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_4_3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinserth 2, 2, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_4_3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinserth 2, 2, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_5_3(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI21_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI21_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_5_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinserth 2, 2, 10
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_5_3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinserth 2, 2, 10
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_5_3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinserth 2, 2, 10
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 3, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_6_4(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinserth 2, 2, 2
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_6_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI22_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI22_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_6_4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C2(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_6_4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C2(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 4, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @shuffle_vector_halfword_7_4(<8 x i16> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinserth 2, 2, 0
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_halfword_7_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI23_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI23_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_halfword_7_4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C3(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_halfword_7_4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C3(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>
+  ret <8 x i16> %vecins
+}
+
+; The following testcases take one byte element from the second vector and
+; insert it at various locations in the first vector.
+define <16 x i8> @shuffle_vector_byte_0_16(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-NEXT:    vinsertb 2, 3, 15
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_0_16:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-BE-NEXT:    vinsertb 2, 3, 0
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_0_16:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-64-NEXT:    vinsertb 2, 3, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_0_16:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-32-NEXT:    vinsertb 2, 3, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_25(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-NEXT:    vinsertb 2, 3, 14
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_1_25:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-BE-NEXT:    vinsertb 2, 3, 1
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_1_25:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-64-NEXT:    vinsertb 2, 3, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_1_25:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-32-NEXT:    vinsertb 2, 3, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 25, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_18(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-NEXT:    vinsertb 2, 3, 13
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_2_18:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-BE-NEXT:    vinsertb 2, 3, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_2_18:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-64-NEXT:    vinsertb 2, 3, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_2_18:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-32-NEXT:    vinsertb 2, 3, 2
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_27(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-NEXT:    vinsertb 2, 3, 12
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_3_27:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-BE-NEXT:    vinsertb 2, 3, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_3_27:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-64-NEXT:    vinsertb 2, 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_3_27:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-32-NEXT:    vinsertb 2, 3, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_20(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-NEXT:    vinsertb 2, 3, 11
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_4_20:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-BE-NEXT:    vinsertb 2, 3, 4
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_4_20:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-64-NEXT:    vinsertb 2, 3, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_4_20:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-32-NEXT:    vinsertb 2, 3, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_29(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-NEXT:    vinsertb 2, 3, 10
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_5_29:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-BE-NEXT:    vinsertb 2, 3, 5
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_5_29:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-64-NEXT:    vinsertb 2, 3, 5
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_5_29:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-32-NEXT:    vinsertb 2, 3, 5
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 29, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_22(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-NEXT:    vinsertb 2, 3, 9
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_6_22:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-BE-NEXT:    vinsertb 2, 3, 6
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_6_22:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-64-NEXT:    vinsertb 2, 3, 6
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_6_22:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-32-NEXT:    vinsertb 2, 3, 6
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_31(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-NEXT:    vinsertb 2, 3, 8
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_7_31:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-BE-NEXT:    vinsertb 2, 3, 7
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_7_31:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-64-NEXT:    vinsertb 2, 3, 7
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_7_31:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-32-NEXT:    vinsertb 2, 3, 7
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_24(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 3, 7
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_8_24:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-BE-NEXT:    vinsertb 2, 3, 8
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_8_24:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-64-NEXT:    vinsertb 2, 3, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_8_24:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-32-NEXT:    vinsertb 2, 3, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_17(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-NEXT:    vinsertb 2, 3, 6
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_9_17:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-BE-NEXT:    vinsertb 2, 3, 9
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_9_17:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-64-NEXT:    vinsertb 2, 3, 9
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_9_17:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-32-NEXT:    vinsertb 2, 3, 9
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_26(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-NEXT:    vinsertb 2, 3, 5
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_10_26:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-BE-NEXT:    vinsertb 2, 3, 10
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_10_26:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-64-NEXT:    vinsertb 2, 3, 10
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_10_26:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-32-NEXT:    vinsertb 2, 3, 10
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_19(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-NEXT:    vinsertb 2, 3, 4
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_11_19:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-BE-NEXT:    vinsertb 2, 3, 11
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_11_19:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-64-NEXT:    vinsertb 2, 3, 11
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_11_19:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-32-NEXT:    vinsertb 2, 3, 11
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 19, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_28(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-NEXT:    vinsertb 2, 3, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_12_28:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-BE-NEXT:    vinsertb 2, 3, 12
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_12_28:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-64-NEXT:    vinsertb 2, 3, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_12_28:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-32-NEXT:    vinsertb 2, 3, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_21(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-NEXT:    vinsertb 2, 3, 2
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_13_21:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-BE-NEXT:    vinsertb 2, 3, 13
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_13_21:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-64-NEXT:    vinsertb 2, 3, 13
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_13_21:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-32-NEXT:    vinsertb 2, 3, 13
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_30(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-NEXT:    vinsertb 2, 3, 1
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_14_30:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-BE-NEXT:    vinsertb 2, 3, 14
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_14_30:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-64-NEXT:    vinsertb 2, 3, 14
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_14_30:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-32-NEXT:    vinsertb 2, 3, 14
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_23(<16 x i8> %a, <16 x i8> %b) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-NEXT:    vinsertb 2, 3, 0
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_15_23:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 3, 15
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_15_23:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 3, 15
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_15_23:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 3, 15
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23>
+  ret <16 x i8> %vecins
+}
+
+; The following testcases take one byte element from the first vector and
+; insert it at various locations in the second vector.
+define <16 x i8> @shuffle_vector_byte_16_8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_16_8:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 1
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 0
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_16_8:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 0
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_16_8:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 1
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 0
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_16_8:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 1
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 0
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_17_1(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_17_1:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 10
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 1
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_17_1:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 1
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_17_1:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 10
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 1
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_17_1:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 10
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 1
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_18_10(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_18_10:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 3
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 2
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_18_10:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 2
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_18_10:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 3
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 2
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_18_10:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 3
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 2
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 10, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_19_3(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_19_3:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 12
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 3
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_19_3:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 3
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_19_3:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 12
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 3
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_19_3:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 12
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 3
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_20_12(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_20_12:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 5
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 4
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_20_12:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 4
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_20_12:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 5
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 4
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_20_12:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 5
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 4
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 12, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_21_5(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_21_5:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 14
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 5
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_21_5:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 5
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_21_5:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 14
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 5
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_21_5:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 14
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 5
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_22_14(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_22_14:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 7
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 6
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_22_14:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 6
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_22_14:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 7
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 6
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_22_14:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 7
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 6
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 14, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_23_7(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_23_7:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 7
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_23_7:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 7
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_23_7:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 7
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_23_7:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 7
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_24_0(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_24_0:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 9
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 8
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_24_0:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 8
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_24_0:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 9
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 8
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_24_0:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 9
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 8
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_25_9(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_25_9:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 2
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 9
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_25_9:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 9
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_25_9:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 2
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 9
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_25_9:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 2
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 9
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_26_2(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_26_2:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 11
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 10
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_26_2:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 10
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_26_2:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 11
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 10
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_26_2:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 11
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 10
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 2, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_27_11(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_27_11:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 4
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 11
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_27_11:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 11
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_27_11:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 4
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 11
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_27_11:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 4
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 11
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 11, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_28_4(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_28_4:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 13
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 12
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_28_4:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 12
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_28_4:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 13
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 12
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_28_4:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 13
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 12
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 4, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_29_13(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_29_13:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 6
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 13
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_29_13:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 13
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_29_13:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 6
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 13
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_29_13:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 6
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 13
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 13, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_30_6(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_30_6:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 15
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 14
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_30_6:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 14
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_30_6:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 15
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 14
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_30_6:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 15
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 14
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 6, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_31_15(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-64-OPT-LABEL: shuffle_vector_byte_31_15:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    vsldoi 2, 2, 2, 8
+; CHECK-64-OPT-NEXT:    vinsertb 3, 2, 15
+; CHECK-64-OPT-NEXT:    vmr 2, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: shuffle_vector_byte_31_15:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-64-O0-NEXT:    vmr 3, 2
+; CHECK-64-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-64-O0-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 15
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: shuffle_vector_byte_31_15:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    vsldoi 2, 2, 2, 8
+; CHECK-32-OPT-NEXT:    vinsertb 3, 2, 15
+; CHECK-32-OPT-NEXT:    vmr 2, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: shuffle_vector_byte_31_15:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    stxv 35, -16(1) # 16-byte Folded Spill
+; CHECK-32-O0-NEXT:    vmr 3, 2
+; CHECK-32-O0-NEXT:    lxv 34, -16(1) # 16-byte Folded Reload
+; CHECK-32-O0-NEXT:    vsldoi 3, 3, 3, 8
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 15
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 15>
+  ret <16 x i8> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector.  If byte element 7 in BE mode (or 8 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction.
+define <16 x i8> @shuffle_vector_byte_0_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI56_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI56_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_0_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 0
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_0_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_0_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 14
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_1_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI57_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI57_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_1_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C4(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_1_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C4(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 13
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_2_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI58_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI58_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_2_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C5(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_2_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C5(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI59_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI59_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_3_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_3_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_3_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI60_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI60_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_4_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 4
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_4_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_4_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 10
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_5_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI61_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI61_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_5_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C6(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_5_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C6(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 9
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_6_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI62_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI62_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_6_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C7(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_6_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C7(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 8
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_7_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI63_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI63_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_7_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C8(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_7_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C8(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI64_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI64_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_8_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 8
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_8_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_8_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI65_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI65_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_9_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 9
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_9_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 9
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_9_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 9
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI66_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI66_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_10_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 10
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_10_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 10
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_10_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 10
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 7, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 4
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_11_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI67_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI67_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_11_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C9(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_11_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C9(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 8, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_12_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI68_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI68_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_12_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C10(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_12_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C10(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI69_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI69_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_13_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 13
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_13_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 13
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_13_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 13
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 7, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_7(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LCPI70_0@toc@ha
+; CHECK-NEXT:    addi 3, 3, .LCPI70_0@toc@l
+; CHECK-NEXT:    lxvx 35, 0, 3
+; CHECK-NEXT:    vperm 2, 2, 2, 3
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_14_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    vinsertb 2, 2, 14
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_14_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vinsertb 2, 2, 14
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_14_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    vinsertb 2, 2, 14
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_8(<16 x i8> %a) {
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vinsertb 2, 2, 0
+; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: shuffle_vector_byte_15_8:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    addis 3, 2, .LCPI71_0@toc@ha
+; CHECK-BE-NEXT:    addi 3, 3, .LCPI71_0@toc@l
+; CHECK-BE-NEXT:    lxvx 35, 0, 3
+; CHECK-BE-NEXT:    vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    blr
+; CHECK-64-LABEL: shuffle_vector_byte_15_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    ld 3, L..C11(2)
+; CHECK-64-NEXT:    lxvx 35, 0, 3
+; CHECK-64-NEXT:    vperm 2, 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: shuffle_vector_byte_15_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 3, L..C11(2)
+; CHECK-32-NEXT:    lxvx 35, 0, 3
+; CHECK-32-NEXT:    vperm 2, 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+  ret <16 x i8> %vecins
+}
+
+; The following tests try to insert one halfword element into the vector.  We
+; should always be using the 'vinserth' instruction.
+define <8 x i16> @insert_halfword_0(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_0:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 0
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_0:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 0
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_0:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 0
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_0:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 0
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 0
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_1(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_1:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 2
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_1:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 2
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_1:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 2
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_1:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 2
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 1
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_2(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_2:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 4
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_2:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 4
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_2:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 4
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_2:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 4
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 2
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_3(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_3:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 6
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_3:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 6
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_3:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 6
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_3:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 6
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 3
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_4(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_4:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 8
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_4:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 8
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_4:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 8
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_4:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 8
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 4
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_5(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_5:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 10
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_5:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 10
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_5:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 10
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_5:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 10
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 5
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_6(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_6:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 12
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_6:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 12
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_6:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 12
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_6:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 12
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 6
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @insert_halfword_7(<8 x i16> %a, i16 %b) {
+; CHECK-64-OPT-LABEL: insert_halfword_7:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinserth 2, 3, 14
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_halfword_7:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinserth 2, 3, 14
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_halfword_7:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinserth 2, 3, 14
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_halfword_7:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinserth 2, 3, 14
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <8 x i16> %a, i16 %b, i32 7
+  ret <8 x i16> %vecins
+}
+
+; The following tests try to insert one byte element into the vector.  We
+; should always be using the 'vinsertb' instruction.
+define <16 x i8> @insert_byte_0(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_0:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 0
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_0:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 0
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_0:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 0
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_0:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 0
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 0
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_1(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_1:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 1
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_1:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 1
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_1:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 1
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_1:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 1
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 1
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_2(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_2:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 2
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_2:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 2
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_2:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 2
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_2:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 2
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 2
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_3(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_3:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 3
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_3:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 3
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_3:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 3
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_3:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 3
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 3
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_4(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_4:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 4
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_4:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 4
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_4:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 4
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_4:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 4
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 4
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_5(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_5:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 5
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_5:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 5
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_5:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 5
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_5:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 5
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 5
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_6(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_6:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 6
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_6:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 6
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_6:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 6
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_6:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 6
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 6
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_7(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_7:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 7
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_7:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 7
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_7:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 7
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_7:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 7
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 7
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_8(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_8:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 8
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_8:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 8
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_8:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 8
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_8:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 8
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 8
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_9(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_9:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 9
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_9:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 9
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_9:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 9
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_9:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 9
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 9
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_10(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_10:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 10
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_10:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 10
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_10:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 10
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_10:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 10
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 10
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_11(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_11:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 11
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_11:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 11
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_11:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 11
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_11:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 11
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 11
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_12(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_12:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 12
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_12:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 12
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_12:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 12
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_12:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 12
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 12
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_13(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_13:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 13
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_13:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 13
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_13:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 13
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_13:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 13
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 13
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_14(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_14:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 14
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_14:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 14
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_14:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 14
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_14:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 14
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 14
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @insert_byte_15(<16 x i8> %a, i8 %b) {
+; CHECK-64-OPT-LABEL: insert_byte_15:
+; CHECK-64-OPT:       # %bb.0: # %entry
+; CHECK-64-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-64-OPT-NEXT:    vinsertb 2, 3, 15
+; CHECK-64-OPT-NEXT:    blr
+;
+; CHECK-64-O0-LABEL: insert_byte_15:
+; CHECK-64-O0:       # %bb.0: # %entry
+; CHECK-64-O0-NEXT:    # kill: def $r3 killed $r3 killed $x3
+; CHECK-64-O0-NEXT:    mtfprwz 0, 3
+; CHECK-64-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-64-O0-NEXT:    vinsertb 2, 3, 15
+; CHECK-64-O0-NEXT:    blr
+;
+; CHECK-32-OPT-LABEL: insert_byte_15:
+; CHECK-32-OPT:       # %bb.0: # %entry
+; CHECK-32-OPT-NEXT:    mtvsrwz 35, 3
+; CHECK-32-OPT-NEXT:    vinsertb 2, 3, 15
+; CHECK-32-OPT-NEXT:    blr
+;
+; CHECK-32-O0-LABEL: insert_byte_15:
+; CHECK-32-O0:       # %bb.0: # %entry
+; CHECK-32-O0-NEXT:    # kill: def $r4 killed $r3
+; CHECK-32-O0-NEXT:    mtfprwz 0, 3
+; CHECK-32-O0-NEXT:    xscpsgndp 35, 0, 0
+; CHECK-32-O0-NEXT:    vinsertb 2, 3, 15
+; CHECK-32-O0-NEXT:    blr
+entry:
+  %vecins = insertelement <16 x i8> %a, i8 %b, i32 15
+  ret <16 x i8> %vecins
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
new file mode 100644
index 0000000000000..31c583c9b2c63
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-p9-xxinsertw-xxextractuw.ll
@@ -0,0 +1,1584 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-32
+
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define float @_Z13testUiToFpExtILj0EEfDv4_j(<4 x i32> %a) {
+; CHECK-64-LABEL: _Z13testUiToFpExtILj0EEfDv4_j:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 0
+; CHECK-64-NEXT:    xscvuxdsp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z13testUiToFpExtILj0EEfDv4_j:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -32(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxdsp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 0
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj1EEfDv4_j(<4 x i32> %a) {
+; CHECK-64-LABEL: _Z13testUiToFpExtILj1EEfDv4_j:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 4
+; CHECK-64-NEXT:    xscvuxdsp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z13testUiToFpExtILj1EEfDv4_j:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -28(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxdsp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj2EEfDv4_j(<4 x i32> %a) {
+; CHECK-64-LABEL: _Z13testUiToFpExtILj2EEfDv4_j:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 8
+; CHECK-64-NEXT:    xscvuxdsp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z13testUiToFpExtILj2EEfDv4_j:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -24(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxdsp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 2
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+define float @_Z13testUiToFpExtILj3EEfDv4_j(<4 x i32> %a) {
+; CHECK-64-LABEL: _Z13testUiToFpExtILj3EEfDv4_j:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 12
+; CHECK-64-NEXT:    xscvuxdsp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z13testUiToFpExtILj3EEfDv4_j:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -20(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxdsp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 3
+  %conv = uitofp i32 %vecext to float
+  ret float %conv
+}
+
+; Verify we generate optimal code for unsigned vector int elem extract followed
+; by conversion to double
+
+define double @conv2dlbTestui0(<4 x i32> %a) {
+; CHECK-64-LABEL: conv2dlbTestui0:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 0
+; CHECK-64-NEXT:    xscvuxddp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: conv2dlbTestui0:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -32(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxddp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = extractelement <4 x i32> %a, i32 0
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui1(<4 x i32> %a) {
+; CHECK-64-LABEL: conv2dlbTestui1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 4
+; CHECK-64-NEXT:    xscvuxddp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: conv2dlbTestui1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -28(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxddp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = extractelement <4 x i32> %a, i32 1
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui2(<4 x i32> %a) {
+; CHECK-64-LABEL: conv2dlbTestui2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 8
+; CHECK-64-NEXT:    xscvuxddp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: conv2dlbTestui2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -24(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxddp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = extractelement <4 x i32> %a, i32 2
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+define double @conv2dlbTestui3(<4 x i32> %a) {
+; CHECK-64-LABEL: conv2dlbTestui3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 12
+; CHECK-64-NEXT:    xscvuxddp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: conv2dlbTestui3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwz 3, -20(1)
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxddp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = extractelement <4 x i32> %a, i32 3
+  %1 = uitofp i32 %0 to double
+  ret double %1
+}
+
+; verify we don't crash for variable elem extract
+define double @conv2dlbTestuiVar(<4 x i32> %a, i32 zeroext %elem) {
+; CHECK-64-LABEL: conv2dlbTestuiVar:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xscvuxddp 1, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: conv2dlbTestuiVar:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -32
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    lwzx 3, 4, 3
+; CHECK-32-NEXT:    stw 3, -4(1)
+; CHECK-32-NEXT:    addi 3, 1, -4
+; CHECK-32-NEXT:    lfiwzx 0, 0, 3
+; CHECK-32-NEXT:    xscvuxddp 1, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %elem
+  %conv = uitofp i32 %vecext to double
+  ret double %conv
+}
+
+define <4 x float> @_Z10testInsEltILj0EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj0EDv4_ffET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xscvdpspn 0, 1
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj0EDv4_ffET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xscvdpspn 0, 1
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 0
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj1EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj1EDv4_ffET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xscvdpspn 0, 1
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj1EDv4_ffET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xscvdpspn 0, 1
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 1
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj2EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj2EDv4_ffET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xscvdpspn 0, 1
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj2EDv4_ffET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xscvdpspn 0, 1
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 2
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z10testInsEltILj3EDv4_ffET0_S1_T1_(<4 x float> %a, float %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj3EDv4_ffET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xscvdpspn 0, 1
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj3EDv4_ffET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xscvdpspn 0, 1
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 3
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj0EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj0EDv4_jjET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj0EDv4_jjET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 0
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj1EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj1EDv4_jjET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj1EDv4_jjET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 1
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj2EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj2EDv4_jjET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj2EDv4_jjET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 2
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z10testInsEltILj3EDv4_jjET0_S1_T1_(<4 x i32> %a, i32 zeroext %b) {
+; CHECK-64-LABEL: _Z10testInsEltILj3EDv4_jjET0_S1_T1_:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z10testInsEltILj3EDv4_jjET0_S1_T1_:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x i32> %a, i32 %b, i32 3
+  ret <4 x i32> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj1EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj2EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj0ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj1EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj2EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj1ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj1EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj2EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj2ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj1EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj1EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj2EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj2EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @_Z7testInsILj3ELj3EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x float> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj1EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj2EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 2, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj0ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj1EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj2EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 2, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj1ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj1EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj2EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj2ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 0>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj1EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj1EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj2EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj2EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxsldwi 0, 35, 35, 1
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+  ret <4 x i32> %vecins
+}
+
+define <4 x i32> @_Z7testInsILj3ELj3EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-64-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_r:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 0, 35
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_r:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 0, 35
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x i32> %vecins
+}
+define <4 x float> @testSameVecEl0BE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl0BE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 34, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl0BE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 34, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl2BE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl2BE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 34, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl2BE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 34, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl3BE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl3BE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 34, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl3BE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 34, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl0LE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl0LE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxspltw 0, 34, 2
+; CHECK-64-NEXT:    xxsldwi 0, 34, 0, 1
+; CHECK-64-NEXT:    xxsldwi 34, 0, 0, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl0LE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxspltw 0, 34, 2
+; CHECK-32-NEXT:    xxsldwi 0, 34, 0, 1
+; CHECK-32-NEXT:    xxsldwi 34, 0, 0, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl1LE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl1LE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxswapd 35, 34
+; CHECK-64-NEXT:    vmrghw 2, 2, 3
+; CHECK-64-NEXT:    vmrghw 2, 2, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl1LE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxswapd 35, 34
+; CHECK-32-NEXT:    vmrghw 2, 2, 3
+; CHECK-32-NEXT:    vmrghw 2, 2, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %vecins
+}
+define <4 x float> @testSameVecEl3LE(<4 x float> %a) {
+; CHECK-64-LABEL: testSameVecEl3LE:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxspltw 0, 34, 2
+; CHECK-64-NEXT:    xxswapd 1, 34
+; CHECK-64-NEXT:    xxsldwi 34, 1, 0, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testSameVecEl3LE:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxspltw 0, 34, 2
+; CHECK-32-NEXT:    xxswapd 1, 34
+; CHECK-32-NEXT:    xxsldwi 34, 1, 0, 2
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+  ret <4 x float> %vecins
+}
+define <4 x float> @insertVarF(<4 x float> %a, float %f, i32 %el) {
+; CHECK-64-LABEL: insertVarF:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 4, 2, 28, 29
+; CHECK-64-NEXT:    addi 4, 1, -16
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stfsx 1, 4, 3
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: insertVarF:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    rlwinm 3, 4, 2, 28, 29
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stfsx 1, 4, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %f, i32 %el
+  ret <4 x float> %vecins
+}
+define <4 x i32> @insertVarI(<4 x i32> %a, i32 %i, i32 %el) {
+; CHECK-64-LABEL: insertVarI:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    addi 5, 1, -16
+; CHECK-64-NEXT:    rlwinm 4, 4, 2, 28, 29
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stwx 3, 5, 4
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: insertVarI:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 4, 4, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stwx 3, 5, 4
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x i32> %a, i32 %i, i32 %el
+  ret <4 x i32> %vecins
+}
+define <4 x i32> @intrinsicInsertTest(<4 x i32> %a, <2 x i64> %b) {
+; CHECK-64-LABEL: intrinsicInsertTest:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxinsertw 34, 35, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: intrinsicInsertTest:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxinsertw 34, 35, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %ans = tail call <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32> %a, <2 x i64> %b, i32 3)
+  ret <4 x i32> %ans
+}
+declare <4 x i32> @llvm.ppc.vsx.xxinsertw(<4 x i32>, <2 x i64>, i32)
+define <2 x i64> @intrinsicExtractTest(<2 x i64> %a) {
+; CHECK-64-LABEL: intrinsicExtractTest:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xxextractuw 0, 34, 5
+; CHECK-64-NEXT:    xxlor 34, 0, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: intrinsicExtractTest:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xxextractuw 0, 34, 5
+; CHECK-32-NEXT:    xxlor 34, 0, 0
+; CHECK-32-NEXT:    blr
+entry:
+  %ans = tail call <2 x i64> @llvm.ppc.vsx.xxextractuw(<2 x i64> %a, i32 5)
+  ret <2 x i64> %ans
+}
+declare <2 x i64>  @llvm.ppc.vsx.xxextractuw(<2 x i64>, i32)

diff  --git a/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9.ll b/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9.ll
new file mode 100644
index 0000000000000..0d57bd5bdcd3f
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-64
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-32
+
+define zeroext i8 @test1(<16 x i8> %a, i32 signext %index) {
+; CHECK-64-LABEL: test1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vextublx 3, 3, 2
+; CHECK-64-NEXT:    clrldi 3, 3, 56
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    clrlwi 3, 3, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lbzx 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 %index
+  ret i8 %vecext
+}
+
+define signext i8 @test2(<16 x i8> %a, i32 signext %index) {
+; CHECK-64-LABEL: test2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vextublx 3, 3, 2
+; CHECK-64-NEXT:    extsb 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    clrlwi 3, 3, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lbzx 3, 4, 3
+; CHECK-32-NEXT:    extsb 3, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 %index
+  ret i8 %vecext
+}
+
+define zeroext i16 @test3(<8 x i16> %a, i32 signext %index) {
+; CHECK-64-LABEL: test3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-64-NEXT:    vextuhlx 3, 3, 2
+; CHECK-64-NEXT:    clrldi 3, 3, 48
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lhzx 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 %index
+  ret i16 %vecext
+}
+
+define signext i16 @test4(<8 x i16> %a, i32 signext %index) {
+; CHECK-64-LABEL: test4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-64-NEXT:    vextuhlx 3, 3, 2
+; CHECK-64-NEXT:    extsh 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lhax 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 %index
+  ret i16 %vecext
+}
+
+define zeroext i32 @test5(<4 x i32> %a, i32 signext %index) {
+; CHECK-64-LABEL: test5:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test5:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwzx 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %index
+  ret i32 %vecext
+}
+
+define signext i32 @test6(<4 x i32> %a, i32 signext %index) {
+; CHECK-64-LABEL: test6:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test6:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwzx 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %index
+  ret i32 %vecext
+}
+
+; Test with immediate index
+define zeroext i8 @test7(<16 x i8> %a) {
+; CHECK-64-LABEL: test7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 3, 1
+; CHECK-64-NEXT:    vextublx 3, 3, 2
+; CHECK-64-NEXT:    clrldi 3, 3, 56
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lbz 3, -15(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 1
+  ret i8 %vecext
+}
+
+define zeroext i16 @test8(<8 x i16> %a) {
+; CHECK-64-LABEL: test8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 3, 2
+; CHECK-64-NEXT:    vextuhlx 3, 3, 2
+; CHECK-64-NEXT:    clrldi 3, 3, 48
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lhz 3, -14(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 1
+  ret i16 %vecext
+}
+
+define zeroext i32 @test9(<4 x i32> %a) {
+; CHECK-64-LABEL: test9:
+; CHECK-64:       # %bb.0:
+; CHECK-64-NEXT:    li 3, 12
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test9:
+; CHECK-32:       # %bb.0:
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 3, -4(1)
+; CHECK-32-NEXT:    blr
+  %vecext = extractelement <4 x i32> %a, i32 3
+  ret i32 %vecext
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9_2.ll b/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9_2.ll
new file mode 100644
index 0000000000000..eac2330d92f96
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_extract_p9_2.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-64
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-32
+
+define zeroext i8 @test_add1(<16 x i8> %a, i32 signext %index, i8 zeroext %c) {
+; CHECK-64-LABEL: test_add1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vextublx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    clrldi 3, 3, 56
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    clrlwi 3, 3, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lbzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    clrlwi 3, 3, 24
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 %index
+  %conv = zext i8 %vecext to i32
+  %conv1 = zext i8 %c to i32
+  %add = add nuw nsw i32 %conv, %conv1
+  %conv2 = trunc i32 %add to i8
+  ret i8 %conv2
+}
+
+define signext i8 @test_add2(<16 x i8> %a, i32 signext %index, i8 signext %c) {
+; CHECK-64-LABEL: test_add2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    vextublx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    extsb 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    clrlwi 3, 3, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lbzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    extsb 3, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 %index
+  %conv3 = zext i8 %vecext to i32
+  %conv14 = zext i8 %c to i32
+  %add = add nuw nsw i32 %conv3, %conv14
+  %conv2 = trunc i32 %add to i8
+  ret i8 %conv2
+}
+
+define zeroext i16 @test_add3(<8 x i16> %a, i32 signext %index, i16 zeroext %c) {
+; CHECK-64-LABEL: test_add3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-64-NEXT:    vextuhlx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    clrldi 3, 3, 48
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lhzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    clrlwi 3, 3, 16
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 %index
+  %conv = zext i16 %vecext to i32
+  %conv1 = zext i16 %c to i32
+  %add = add nuw nsw i32 %conv, %conv1
+  %conv2 = trunc i32 %add to i16
+  ret i16 %conv2
+}
+
+define signext i16 @test_add4(<8 x i16> %a, i32 signext %index, i16 signext %c) {
+; CHECK-64-LABEL: test_add4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-64-NEXT:    vextuhlx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    extsh 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 1, 28, 30
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lhzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    extsh 3, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 %index
+  %conv5 = zext i16 %vecext to i32
+  %conv16 = zext i16 %c to i32
+  %add = add nuw nsw i32 %conv5, %conv16
+  %conv2 = trunc i32 %add to i16
+  ret i16 %conv2
+}
+
+define zeroext i32 @test_add5(<4 x i32> %a, i32 signext %index, i32 zeroext %c) {
+; CHECK-64-LABEL: test_add5:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    clrldi 3, 3, 32
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add5:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %index
+  %add = add i32 %vecext, %c
+  ret i32 %add
+}
+
+define signext i32 @test_add6(<4 x i32> %a, i32 signext %index, i32 signext %c) {
+; CHECK-64-LABEL: test_add6:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    add 3, 3, 4
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test_add6:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwzx 3, 5, 3
+; CHECK-32-NEXT:    add 3, 3, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 %index
+  %add = add nsw i32 %vecext, %c
+  ret i32 %add
+}
+
+; On LE, extracting word element 2 is better done with mfvsrwz than vextuwrx. These big-endian AIX runs use vextuwlx (64-bit) or a stack store/load (32-bit) instead.
+define zeroext i32 @test7(<4 x i32> %a) {
+; CHECK-64-LABEL: test7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 3, 8
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 3, -8(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 2
+  ret i32 %vecext
+}
+
+define zeroext i32 @testadd_7(<4 x i32> %a, i32 zeroext %c) {
+; CHECK-64-LABEL: testadd_7:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 4, 8
+; CHECK-64-NEXT:    vextuwlx 4, 4, 2
+; CHECK-64-NEXT:    add 3, 4, 3
+; CHECK-64-NEXT:    clrldi 3, 3, 32
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testadd_7:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 4, -8(1)
+; CHECK-32-NEXT:    add 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 2
+  %add = add i32 %vecext, %c
+  ret i32 %add
+}
+
+define signext i32 @test8(<4 x i32> %a) {
+; CHECK-64-LABEL: test8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 3, 8
+; CHECK-64-NEXT:    vextuwlx 3, 3, 2
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 3, -8(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 2
+  ret i32 %vecext
+}
+
+define signext i32 @testadd_8(<4 x i32> %a, i32 signext %c) {
+; CHECK-64-LABEL: testadd_8:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 4, 8
+; CHECK-64-NEXT:    vextuwlx 4, 4, 2
+; CHECK-64-NEXT:    add 3, 4, 3
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testadd_8:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 4, -8(1)
+; CHECK-32-NEXT:    add 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 2
+  %add = add nsw i32 %vecext, %c
+  ret i32 %add
+}
+
+; When extracting word element 1 on BE, it's better to use mfvsrwz rather than vextuwlx (64-bit only; the 32-bit run uses a stack store/load).
+define signext i32 @test9(<4 x i32> %a) {
+; CHECK-64-LABEL: test9:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mfvsrwz 3, 34
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: test9:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 3, -12(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  ret i32 %vecext
+}
+
+define signext i32 @testadd_9(<4 x i32> %a, i32 signext %c) {
+; CHECK-64-LABEL: testadd_9:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mfvsrwz 4, 34
+; CHECK-64-NEXT:    add 3, 4, 3
+; CHECK-64-NEXT:    extsw 3, 3
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testadd_9:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    lwz 4, -12(1)
+; CHECK-32-NEXT:    add 3, 4, 3
+; CHECK-32-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %add = add nsw i32 %vecext, %c
+  ret i32 %add
+}

diff  --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
new file mode 100644
index 0000000000000..80d2bb11c992b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
@@ -0,0 +1,611 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-64
+; RUN: llc -verify-machineinstrs -mtriple=powerpc-ibm-aix-xcoff -vec-extabi -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-32
+
+; Byte indexed
+
+define <16 x i8> @testByte(<16 x i8> %a, i64 %b, i64 %idx) {
+; CHECK-64-LABEL: testByte:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    addi 5, 1, -16
+; CHECK-64-NEXT:    clrldi 4, 4, 60
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stbx 3, 5, 4
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testByte:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    clrlwi 3, 6, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stbx 4, 5, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %conv = trunc i64 %b to i8
+  %vecins = insertelement <16 x i8> %a, i8 %conv, i64 %idx
+  ret <16 x i8> %vecins
+}
+
+; Halfword indexed
+
+define <8 x i16> @testHalf(<8 x i16> %a, i64 %b, i64 %idx) {
+; CHECK-64-LABEL: testHalf:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    addi 5, 1, -16
+; CHECK-64-NEXT:    rlwinm 4, 4, 1, 28, 30
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    sthx 3, 5, 4
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testHalf:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 6, 1, 28, 30
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    sthx 4, 5, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %conv = trunc i64 %b to i16
+  %vecins = insertelement <8 x i16> %a, i16 %conv, i64 %idx
+  ret <8 x i16> %vecins
+}
+
+; Word indexed
+
+define <4 x i32> @testWord(<4 x i32> %a, i64 %b, i64 %idx) {
+; CHECK-64-LABEL: testWord:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    addi 5, 1, -16
+; CHECK-64-NEXT:    rlwinm 4, 4, 2, 28, 29
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stwx 3, 5, 4
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testWord:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 6, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stwx 4, 5, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %conv = trunc i64 %b to i32
+  %vecins = insertelement <4 x i32> %a, i32 %conv, i64 %idx
+  ret <4 x i32> %vecins
+}
+
+; Word immediate
+
+define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) {
+; CHECK-64-LABEL: testWordImm:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprwz 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 4
+; CHECK-64-NEXT:    xxinsertw 34, 0, 12
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testWordImm:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 4
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %conv = trunc i64 %b to i32
+  %vecins = insertelement <4 x i32> %a, i32 %conv, i32 1
+  %vecins2 = insertelement <4 x i32> %vecins, i32 %conv, i32 3
+  ret <4 x i32> %vecins2
+}
+
+; Doubleword indexed
+
+define <2 x i64> @testDoubleword(<2 x i64> %a, i64 %b, i64 %idx) {
+; CHECK-64-LABEL: testDoubleword:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    addi 5, 1, -16
+; CHECK-64-NEXT:    rlwinm 4, 4, 3, 28, 28
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stdx 3, 5, 4
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleword:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    add 5, 6, 6
+; CHECK-32-NEXT:    addi 7, 1, -32
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    rlwinm 6, 5, 2, 28, 29
+; CHECK-32-NEXT:    stwx 3, 7, 6
+; CHECK-32-NEXT:    addi 3, 5, 1
+; CHECK-32-NEXT:    addi 5, 1, -16
+; CHECK-32-NEXT:    lxv 0, -32(1)
+; CHECK-32-NEXT:    rlwinm 3, 3, 2, 28, 29
+; CHECK-32-NEXT:    stxv 0, -16(1)
+; CHECK-32-NEXT:    stwx 4, 5, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <2 x i64> %a, i64 %b, i64 %idx
+  ret <2 x i64> %vecins
+}
+
+; Doubleword immediate
+
+define <2 x i64> @testDoublewordImm(<2 x i64> %a, i64 %b) {
+; CHECK-64-LABEL: testDoublewordImm:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprd 0, 3
+; CHECK-64-NEXT:    xxmrghd 34, 34, 0
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoublewordImm:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    mtfprwz 0, 4
+; CHECK-32-NEXT:    xxinsertw 34, 0, 12
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <2 x i64> %a, i64 %b, i32 1
+  ret <2 x i64> %vecins
+}
+
+define <2 x i64> @testDoublewordImm2(<2 x i64> %a, i64 %b) {
+; CHECK-64-LABEL: testDoublewordImm2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    mtfprd 0, 3
+; CHECK-64-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoublewordImm2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    mtfprwz 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    mtfprwz 0, 4
+; CHECK-32-NEXT:    xxinsertw 34, 0, 4
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <2 x i64> %a, i64 %b, i32 0
+  ret <2 x i64> %vecins
+}
+
+; Float indexed
+
+define <4 x float> @testFloat1(<4 x float> %a, float %b, i32 zeroext %idx1) {
+; CHECK-64-LABEL: testFloat1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 4, 2, 28, 29
+; CHECK-64-NEXT:    addi 4, 1, -16
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stfsx 1, 4, 3
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloat1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    rlwinm 3, 4, 2, 28, 29
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stfsx 1, 4, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 %idx1
+  ret <4 x float> %vecins
+}
+
+define <4 x float> @testFloat2(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-64-LABEL: testFloat2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-DAG:     lwz 6, 0(3)
+; CHECK-64-DAG:     rlwinm 4, 4, 2, 28, 29
+; CHECK-64-DAG:     addi 7, 1, -32
+; CHECK-64-DAG:     stxv 34, -32(1)
+; CHECK-64-DAG:     stwx 6, 7, 4
+; CHECK-64-DAG:     rlwinm 4, 5, 2, 28, 29
+; CHECK-64-DAG:     addi 5, 1, -16
+; CHECK-64-DAG:     lxv 0, -32(1)
+; CHECK-64-DAG:     lwz 3, 1(3)
+; CHECK-64-DAG:     stxv 0, -16(1)
+; CHECK-64-DAG:     stwx 3, 5, 4
+; CHECK-64-DAG:     lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloat2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lwz 6, 0(3)
+; CHECK-32-NEXT:    addi 7, 1, -32
+; CHECK-32-NEXT:    rlwinm 4, 4, 2, 28, 29
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    rlwinm 5, 5, 2, 28, 29
+; CHECK-32-NEXT:    stwx 6, 7, 4
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    lxv 0, -32(1)
+; CHECK-32-NEXT:    lwz 3, 1(3)
+; CHECK-32-NEXT:    stxv 0, -16(1)
+; CHECK-32-NEXT:    stwx 3, 4, 5
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = bitcast i8* %b to float*
+  %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1
+  %1 = bitcast i8* %add.ptr1 to float*
+  %2 = load float, float* %0, align 4
+  %vecins = insertelement <4 x float> %a, float %2, i32 %idx1
+  %3 = load float, float* %1, align 4
+  %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2
+  ret <4 x float> %vecins2
+}
+
+define <4 x float> @testFloat3(<4 x float> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-64-LABEL: testFloat3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-DAG:     lis 6, 1
+; CHECK-64-DAG:     rlwinm 4, 4, 2, 28, 29
+; CHECK-64-DAG:     addi 7, 1, -32
+; CHECK-64-DAG:     lwzx 6, 3, 6
+; CHECK-64-DAG:     stxv 34, -32(1)
+; CHECK-64-DAG:     stwx 6, 7, 4
+; CHECK-64-DAG:     li 4, 1
+; CHECK-64-DAG:     lxv 0, -32(1)
+; CHECK-64-DAG:     rldic 4, 4, 36, 27
+; CHECK-64-DAG:     lwzx 3, 3, 4
+; CHECK-64-DAG:     rlwinm 4, 5, 2, 28, 29
+; CHECK-64-DAG:     addi 5, 1, -16
+; CHECK-64-DAG:     stxv 0, -16(1)
+; CHECK-64-DAG:     stwx 3, 5, 4
+; CHECK-64-DAG:     lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloat3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lis 6, 1
+; CHECK-32-NEXT:    addi 7, 1, -32
+; CHECK-32-NEXT:    rlwinm 4, 4, 2, 28, 29
+; CHECK-32-NEXT:    rlwinm 5, 5, 2, 28, 29
+; CHECK-32-NEXT:    lwzx 6, 3, 6
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    stwx 6, 7, 4
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    lxv 0, -32(1)
+; CHECK-32-NEXT:    lwz 3, 0(3)
+; CHECK-32-NEXT:    stxv 0, -16(1)
+; CHECK-32-NEXT:    stwx 3, 4, 5
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536
+  %0 = bitcast i8* %add.ptr to float*
+  %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736
+  %1 = bitcast i8* %add.ptr1 to float*
+  %2 = load float, float* %0, align 4
+  %vecins = insertelement <4 x float> %a, float %2, i32 %idx1
+  %3 = load float, float* %1, align 4
+  %vecins2 = insertelement <4 x float> %vecins, float %3, i32 %idx2
+  ret <4 x float> %vecins2
+}
+
+; Float immediate
+
+define <4 x float> @testFloatImm1(<4 x float> %a, float %b) {
+; CHECK-64-LABEL: testFloatImm1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    xscvdpspn 0, 1
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloatImm1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    xscvdpspn 0, 1
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <4 x float> %a, float %b, i32 0
+  %vecins1 = insertelement <4 x float> %vecins, float %b, i32 2
+  ret <4 x float> %vecins1
+}
+
+define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) {
+; CHECK-64-LABEL: testFloatImm2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    lfs 0, 0(3)
+; CHECK-64-NEXT:    xscvdpspn 0, 0
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    lfs 0, 4(3)
+; CHECK-64-NEXT:    xscvdpspn 0, 0
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloatImm2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lfs 0, 0(3)
+; CHECK-32-NEXT:    xscvdpspn 0, 0
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    lfs 0, 4(3)
+; CHECK-32-NEXT:    xscvdpspn 0, 0
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = bitcast i32* %b to float*
+  %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 1
+  %1 = bitcast i32* %add.ptr1 to float*
+  %2 = load float, float* %0, align 4
+  %vecins = insertelement <4 x float> %a, float %2, i32 0
+  %3 = load float, float* %1, align 4
+  %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2
+  ret <4 x float> %vecins2
+}
+
+define <4 x float> @testFloatImm3(<4 x float> %a, i32* %b) {
+; CHECK-64-LABEL: testFloatImm3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    lis 4, 4
+; CHECK-64-NEXT:    lfsx 0, 3, 4
+; CHECK-64-NEXT:    li 4, 1
+; CHECK-64-NEXT:    rldic 4, 4, 38, 25
+; CHECK-64-NEXT:    xscvdpspn 0, 0
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 0
+; CHECK-64-NEXT:    lfsx 0, 3, 4
+; CHECK-64-NEXT:    xscvdpspn 0, 0
+; CHECK-64-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-64-NEXT:    xxinsertw 34, 0, 8
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testFloatImm3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lis 4, 4
+; CHECK-32-NEXT:    lfsx 0, 3, 4
+; CHECK-32-NEXT:    xscvdpspn 0, 0
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 0
+; CHECK-32-NEXT:    lfs 0, 0(3)
+; CHECK-32-NEXT:    xscvdpspn 0, 0
+; CHECK-32-NEXT:    xxsldwi 0, 0, 0, 3
+; CHECK-32-NEXT:    xxinsertw 34, 0, 8
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536
+  %0 = bitcast i32* %add.ptr to float*
+  %add.ptr1 = getelementptr inbounds i32, i32* %b, i64 68719476736
+  %1 = bitcast i32* %add.ptr1 to float*
+  %2 = load float, float* %0, align 4
+  %vecins = insertelement <4 x float> %a, float %2, i32 0
+  %3 = load float, float* %1, align 4
+  %vecins2 = insertelement <4 x float> %vecins, float %3, i32 2
+  ret <4 x float> %vecins2
+}
+
+; Double indexed (element index is a run-time value)
+
+define <2 x double> @testDouble1(<2 x double> %a, double %b, i32 zeroext %idx1) {
+; CHECK-64-LABEL: testDouble1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    rlwinm 3, 4, 3, 28, 28
+; CHECK-64-NEXT:    addi 4, 1, -16
+; CHECK-64-NEXT:    stxv 34, -16(1)
+; CHECK-64-NEXT:    stfdx 1, 4, 3
+; CHECK-64-NEXT:    lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDouble1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    addi 4, 1, -16
+; CHECK-32-NEXT:    rlwinm 3, 5, 3, 28, 28
+; CHECK-32-NEXT:    stxv 34, -16(1)
+; CHECK-32-NEXT:    stfdx 1, 4, 3
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <2 x double> %a, double %b, i32 %idx1
+  ret <2 x double> %vecins
+}
+
+define <2 x double> @testDouble2(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-64-LABEL: testDouble2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-DAG:     ld 6, 0(3)
+; CHECK-64-DAG:     rlwinm 4, 4, 3, 28, 28
+; CHECK-64-DAG:     addi 7, 1, -32
+; CHECK-64-DAG:     stxv 34, -32(1)
+; CHECK-64-DAG:     stdx 6, 7, 4
+; CHECK-64-DAG:     li 4, 1
+; CHECK-64-DAG:     lxv 0, -32(1)
+; CHECK-64-DAG:     ldx 3, 3, 4
+; CHECK-64-DAG:     rlwinm 4, 5, 3, 28, 28
+; CHECK-64-DAG:     addi 5, 1, -16
+; CHECK-64-DAG:     stxv 0, -16(1)
+; CHECK-64-DAG:     stdx 3, 5, 4
+; CHECK-64-DAG:     lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDouble2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lfd 0, 0(3)
+; CHECK-32-NEXT:    addi 6, 1, -32
+; CHECK-32-NEXT:    rlwinm 4, 4, 3, 28, 28
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    rlwinm 5, 5, 3, 28, 28
+; CHECK-32-NEXT:    stfdx 0, 6, 4
+; CHECK-32-NEXT:    lxv 0, -32(1)
+; CHECK-32-NEXT:    lfd 1, 1(3)
+; CHECK-32-NEXT:    addi 3, 1, -16
+; CHECK-32-NEXT:    stxv 0, -16(1)
+; CHECK-32-NEXT:    stfdx 1, 3, 5
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = bitcast i8* %b to double*
+  %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 1
+  %1 = bitcast i8* %add.ptr1 to double*
+  %2 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %2, i32 %idx1
+  %3 = load double, double* %1, align 8
+  %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2
+  ret <2 x double> %vecins2
+}
+
+define <2 x double> @testDouble3(<2 x double> %a, i8* %b, i32 zeroext %idx1, i32 zeroext %idx2) {
+; CHECK-64-LABEL: testDouble3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-DAG:     lis 6, 1
+; CHECK-64-DAG:     rlwinm 4, 4, 3, 28, 28
+; CHECK-64-DAG:     addi 7, 1, -32
+; CHECK-64-DAG:     ldx 6, 3, 6
+; CHECK-64-DAG:     stxv 34, -32(1)
+; CHECK-64-DAG:     stdx 6, 7, 4
+; CHECK-64-DAG:     li 4, 1
+; CHECK-64-DAG:     lxv 0, -32(1)
+; CHECK-64-DAG:     rldic 4, 4, 36, 27
+; CHECK-64-DAG:     ldx 3, 3, 4
+; CHECK-64-DAG:     rlwinm 4, 5, 3, 28, 28
+; CHECK-64-DAG:     addi 5, 1, -16
+; CHECK-64-DAG:     stxv 0, -16(1)
+; CHECK-64-DAG:     stdx 3, 5, 4
+; CHECK-64-DAG:     lxv 34, -16(1)
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDouble3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lis 6, 1
+; CHECK-32-NEXT:    rlwinm 4, 4, 3, 28, 28
+; CHECK-32-NEXT:    rlwinm 5, 5, 3, 28, 28
+; CHECK-32-NEXT:    lfdx 0, 3, 6
+; CHECK-32-NEXT:    addi 6, 1, -32
+; CHECK-32-NEXT:    stxv 34, -32(1)
+; CHECK-32-NEXT:    stfdx 0, 6, 4
+; CHECK-32-NEXT:    lxv 0, -32(1)
+; CHECK-32-NEXT:    lfd 1, 0(3)
+; CHECK-32-NEXT:    addi 3, 1, -16
+; CHECK-32-NEXT:    stxv 0, -16(1)
+; CHECK-32-NEXT:    stfdx 1, 3, 5
+; CHECK-32-NEXT:    lxv 34, -16(1)
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i8, i8* %b, i64 65536
+  %0 = bitcast i8* %add.ptr to double*
+  %add.ptr1 = getelementptr inbounds i8, i8* %b, i64 68719476736
+  %1 = bitcast i8* %add.ptr1 to double*
+  %2 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %2, i32 %idx1
+  %3 = load double, double* %1, align 8
+  %vecins2 = insertelement <2 x double> %vecins, double %3, i32 %idx2
+  ret <2 x double> %vecins2
+}
+
+; Double immediate (element index is a compile-time constant)
+
+define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
+; CHECK-64-LABEL: testDoubleImm1:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; CHECK-64-NEXT:    xxpermdi 34, 1, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleImm1:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; CHECK-32-NEXT:    xxpermdi 34, 1, 34, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %vecins = insertelement <2 x double> %a, double %b, i32 0
+  ret <2 x double> %vecins
+}
+
+define <2 x double> @testDoubleImm2(<2 x double> %a, i32* %b) {
+; CHECK-64-LABEL: testDoubleImm2:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    lfd 0, 0(3)
+; CHECK-64-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleImm2:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lfd 0, 0(3)
+; CHECK-32-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %0 = bitcast i32* %b to double*
+  %1 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %1, i32 0
+  ret <2 x double> %vecins
+}
+
+define <2 x double> @testDoubleImm3(<2 x double> %a, i32* %b) {
+; CHECK-64-LABEL: testDoubleImm3:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    lfd 0, 4(3)
+; CHECK-64-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleImm3:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lfd 0, 4(3)
+; CHECK-32-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %b, i64 1
+  %0 = bitcast i32* %add.ptr to double*
+  %1 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %1, i32 0
+  ret <2 x double> %vecins
+}
+
+define <2 x double> @testDoubleImm4(<2 x double> %a, i32* %b) {
+; CHECK-64-LABEL: testDoubleImm4:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    lis 4, 4
+; CHECK-64-NEXT:    lfdx 0, 3, 4
+; CHECK-64-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleImm4:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lis 4, 4
+; CHECK-32-NEXT:    lfdx 0, 3, 4
+; CHECK-32-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536
+  %0 = bitcast i32* %add.ptr to double*
+  %1 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %1, i32 0
+  ret <2 x double> %vecins
+}
+
+define <2 x double> @testDoubleImm5(<2 x double> %a, i32* %b) {
+; CHECK-64-LABEL: testDoubleImm5:
+; CHECK-64:       # %bb.0: # %entry
+; CHECK-64-NEXT:    li 4, 1
+; CHECK-64-NEXT:    rldic 4, 4, 38, 25
+; CHECK-64-NEXT:    lfdx 0, 3, 4
+; CHECK-64-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-64-NEXT:    blr
+;
+; CHECK-32-LABEL: testDoubleImm5:
+; CHECK-32:       # %bb.0: # %entry
+; CHECK-32-NEXT:    lfd 0, 0(3)
+; CHECK-32-NEXT:    xxpermdi 34, 0, 34, 1
+; CHECK-32-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %b, i64 68719476736
+  %0 = bitcast i32* %add.ptr to double*
+  %1 = load double, double* %0, align 8
+  %vecins = insertelement <2 x double> %a, double %1, i32 0
+  ret <2 x double> %vecins
+}
+

diff  --git a/llvm/test/CodeGen/PowerPC/vec-bswap.ll b/llvm/test/CodeGen/PowerPC/vec-bswap.ll
index 7ff5b97780b2b..d351fa3edc5bf 100644
--- a/llvm/test/CodeGen/PowerPC/vec-bswap.ll
+++ b/llvm/test/CodeGen/PowerPC/vec-bswap.ll
@@ -1,10 +1,25 @@
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 \
 ; RUN:   -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s
+
+; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -mcpu=pwr9 \
+; RUN:   -verify-machineinstrs -vec-extabi | \
+; RUN:   FileCheck %s --check-prefixes=AIX,AIX64
+; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff -mcpu=pwr9 \
+; RUN:   -verify-machineinstrs  -vec-extabi | \
+; RUN:   FileCheck %s --check-prefixes=AIX,AIX32
+
 define dso_local void @test(i32* %Arr, i32 signext %Len) {
 ; CHECK-LABEL: test:
 ; CHECK:         lxvx [[REG:vs[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}
 ; CHECK-NOT:     [[REG]]
 ; CHECK:         xxbrw vs{{[0-9]+}}, [[REG]]
+
+; AIX-LABEL:     test:
+; AIX64:         lxvx [[REG64:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; AIX32:         lxv [[REG32:[0-9]+]], {{[0-9]+}}({{[0-9]+}})
+; AIX64-NOT:     [[REG64]]
+; AIX64:         xxbrw {{[0-9]+}}, [[REG64]]
+; AIX32:         xxbrw {{[0-9]+}}, [[REG32]]
 entry:
   %cmp1 = icmp slt i32 0, %Len
   br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup
@@ -77,6 +92,10 @@ define dso_local <8 x i16> @test_halfword(<8 x i16> %a) local_unnamed_addr {
 ; CHECK-LABEL: test_halfword:
 ; CHECK:       xxbrh vs34, vs34
 ; CHECK-NEXT:  blr
+
+; AIX-LABEL:   test_halfword:
+; AIX:         xxbrh 34, 34
+; AIX-NEXT:    blr
 entry:
   %0 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a)
   ret <8 x i16> %0
@@ -86,6 +105,10 @@ define dso_local <2 x i64> @test_doubleword(<2 x i64> %a) local_unnamed_addr {
 ; CHECK-LABEL: test_doubleword:
 ; CHECK:       xxbrd vs34, vs34
 ; CHECK-NEXT:  blr
+
+; AIX-LABEL:   test_doubleword:
+; AIX:         xxbrd 34, 34
+; AIX-NEXT:    blr
 entry:
   %0 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a)
   ret <2 x i64> %0
@@ -95,6 +118,10 @@ define dso_local <1 x i128> @test_quadword(<1 x i128> %a) local_unnamed_addr {
 ; CHECK-LABEL: test_quadword:
 ; CHECK:       xxbrq vs34, vs34
 ; CHECK-NEXT:  blr
+
+; AIX-LABEL:   test_quadword:
+; AIX:         xxbrq 34, 34
+; AIX-NEXT:    blr
 entry:
   %0 = call <1 x i128> @llvm.bswap.v1i128(<1 x i128> %a)
   ret <1 x i128> %0


        


More information about the llvm-commits mailing list