[llvm] r307169 - [Power9] Exploit vector integer extend instructions when indices aren't correct.

Wed Jul 5 09:00:38 PDT 2017

Author: jtony
Date: Wed Jul  5 09:00:38 2017
New Revision: 307169

URL: http://llvm.org/viewvc/llvm-project?rev=307169&view=rev
Log:
[Power9] Exploit vector integer extend instructions when indices aren't correct.

This patch adds on to the exploitation added by https://reviews.llvm.org/D33510.
This now catches build vector nodes where the inputs are coming from sign
extended vector extract elements where the indices used by the vector extract
are not correct. We can still use the new hardware instructions by adding a
shuffle to move the elements to the correct indices. I introduced a new PPCISD
node here because adding a vector_shuffle and changing the elements of the
vector_extracts was getting undone by another DAG combine.

Commit on behalf of Zaara Syeda (syzaara at ca.ibm.com)
Differential Revision: https://reviews.llvm.org/D34009

Modified:
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
    llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td
    llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td
    llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=307169&r1=307168&r2=307169&view=diff
==============================================================================

--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Wed Jul  5 09:00:38 2017
@@ -1168,6 +1168,7 @@ const char *PPCTargetLowering::getTarget
   case PPCISD::LXSIZX:          return "PPCISD::LXSIZX";
   case PPCISD::STXSIX:          return "PPCISD::STXSIX";
   case PPCISD::VEXTS:           return "PPCISD::VEXTS";
+  case PPCISD::SExtVElems:      return "PPCISD::SExtVElems";
   case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
   case PPCISD::STXVD2X:         return "PPCISD::STXVD2X";
   case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
@@ -11311,6 +11312,132 @@ static SDValue combineBVOfConsecutiveLoa
   return SDValue();
 }
 
+// This function adds the required vector_shuffle needed to get
+// the elements of the vector extract in the correct position
+// as specified by the CorrectElems encoding.
+static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
+                                      SDValue Input, uint64_t Elems,
+                                      uint64_t CorrectElems) {
+  SDLoc dl(N);
+
+  unsigned NumElems = Input.getValueType().getVectorNumElements();
+  SmallVector<int, 16> ShuffleMask(NumElems, -1);
+
+  // Knowing the element indices being extracted from the original
+  // vector and the order in which they're being inserted, just put
+  // them at element indices required for the instruction.
+  for (unsigned i = 0; i < N->getNumOperands(); i++) {
+    if (DAG.getDataLayout().isLittleEndian())
+      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
+    else
+      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
+    CorrectElems = CorrectElems >> 8;
+    Elems = Elems >> 8;
+  }
+
+  SDValue Shuffle =
+      DAG.getVectorShuffle(Input.getValueType(), dl, Input,
+                           DAG.getUNDEF(Input.getValueType()), ShuffleMask);
+
+  EVT Ty = N->getValueType(0);
+  SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
+  return BV;
+}
+
+// Look for build vector patterns where input operands come from sign
+// extended vector_extract elements of specific indices. If the correct indices
+// aren't used, add a vector shuffle to fix up the indices and create a new
+// PPCISD:SExtVElems node which selects the vector sign extend instructions
+// during instruction selection.
+static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
+  // This array encodes the indices that the vector sign extend instructions
+  // extract from when extending from one type to another for both BE and LE.
+  // The right nibble of each byte corresponds to the LE incides.
+  // and the left nibble of each byte corresponds to the BE incides.
+  // For example: 0x3074B8FC  byte->word
+  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
+  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
+  // For example: 0x000070F8  byte->double word
+  // For LE: the allowed indices are: 0x0,0x8
+  // For BE: the allowed indices are: 0x7,0xF
+  uint64_t TargetElems[] = {
+      0x3074B8FC, // b->w
+      0x000070F8, // b->d
+      0x10325476, // h->w
+      0x00003074, // h->d
+      0x00001032, // w->d
+  };
+
+  uint64_t Elems = 0;
+  int Index;
+  SDValue Input;
+
+  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
+    if (!Op)
+      return false;
+    if (Op.getOpcode() != ISD::SIGN_EXTEND)
+      return false;
+
+    SDValue Extract = Op.getOperand(0);
+    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+    if (!ExtOp)
+      return false;
+
+    Index = ExtOp->getZExtValue();
+    if (Input && Input != Extract.getOperand(0))
+      return false;
+
+    if (!Input)
+      Input = Extract.getOperand(0);
+
+    Elems = Elems << 8;
+    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
+    Elems |= Index;
+
+    return true;
+  };
+
+  // If the build vector operands aren't sign extended vector extracts,
+  // of the same input vector, then return.
+  for (unsigned i = 0; i < N->getNumOperands(); i++) {
+    if (!isSExtOfVecExtract(N->getOperand(i))) {
+      return SDValue();
+    }
+  }
+
+  // If the vector extract indicies are not correct, add the appropriate
+  // vector_shuffle.
+  int TgtElemArrayIdx;
+  int InputSize = Input.getValueType().getScalarSizeInBits();
+  int OutputSize = N->getValueType(0).getScalarSizeInBits();
+  if (InputSize + OutputSize == 40)
+    TgtElemArrayIdx = 0;
+  else if (InputSize + OutputSize == 72)
+    TgtElemArrayIdx = 1;
+  else if (InputSize + OutputSize == 48)
+    TgtElemArrayIdx = 2;
+  else if (InputSize + OutputSize == 80)
+    TgtElemArrayIdx = 3;
+  else if (InputSize + OutputSize == 96)
+    TgtElemArrayIdx = 4;
+  else
+    return SDValue();
+
+  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
+  CorrectElems = DAG.getDataLayout().isLittleEndian()
+                     ? CorrectElems & 0x0F0F0F0F0F0F0F0F
+                     : CorrectElems & 0xF0F0F0F0F0F0F0F0;
+  if (Elems != CorrectElems) {
+    return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
+  }
+
+  // Regular lowering will catch cases where a shuffle is not needed.
+  return SDValue();
+}
+
 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -11338,6 +11465,15 @@ SDValue PPCTargetLowering::DAGCombineBui
   if (Reduced)
     return Reduced;
 
+  // If we're building a vector out of extended elements from another vector
+  // we have P9 vector integer extend instructions.
+  if (Subtarget.hasP9Altivec()) {
+    Reduced = combineBVOfVecSExt(N, DAG);
+    if (Reduced)
+      return Reduced;
+  }
+
+
   if (N->getValueType(0) != MVT::v2f64)
     return SDValue();
 

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h?rev=307169&r1=307168&r2=307169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h Wed Jul  5 09:00:38 2017
@@ -67,6 +67,10 @@ namespace llvm {
       /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
       VEXTS,
 
+      /// SExtVElems, takes an input vector of a smaller type and sign
+      /// extends to an output vector of a larger type.
+      SExtVElems,
+
       /// Reciprocal estimate instructions (unary FP ops).
       FRE, FRSQRTE,
 

Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td?rev=307169&r1=307168&r2=307169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td Wed Jul  5 09:00:38 2017
@@ -32,6 +32,9 @@ def SDT_PPCstxsix : SDTypeProfile<0, 3,
 def SDT_PPCVexts  : SDTypeProfile<1, 2, [
   SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
 ]>;
+def SDT_PPCSExtVElems  : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisVec<1>
+]>;
 
 def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32> ]>;
@@ -131,6 +134,7 @@ def PPClxsizx : SDNode<"PPCISD::LXSIZX",
 def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
                        [SDNPHasChain, SDNPMayStore]>;
 def PPCVexts  : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
+def PPCSExtVElems  : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>;
 
 // Extract FPSCR (not modeled at the DAG level).
 def PPCmffs   : SDNode<"PPCISD::MFFS",

Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td?rev=307169&r1=307168&r2=307169&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td Wed Jul  5 09:00:38 2017
@@ -2729,36 +2729,54 @@ def DblToFlt {
 }
 
 def ByteToWord {
-  dag A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
-  dag A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
-  dag A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
-  dag A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+  dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
+  dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
+  dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
+  dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+  dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8));
+  dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8));
+  dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8));
+  dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8));
 }
 
 def ByteToDWord {
-  dag A0 = (i64 (sext_inreg
-            (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
-  dag A1 = (i64 (sext_inreg
-            (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+  dag LE_A0 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
+  dag LE_A1 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+  dag BE_A0 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8));
+  dag BE_A1 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8));
 }
 
 def HWordToWord {
-  dag A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
-  dag A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
-  dag A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
-  dag A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+  dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
+  dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
+  dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
+  dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+  dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16));
+  dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16));
+  dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16));
+  dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16));
 }
 
 def HWordToDWord {
-  dag A0 = (i64 (sext_inreg
-            (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
-  dag A1 = (i64 (sext_inreg
-            (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+  dag LE_A0 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
+  dag LE_A1 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+  dag BE_A0 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16));
+  dag BE_A1 = (i64 (sext_inreg
+              (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16));
 }
 
 def WordToDWord {
-  dag A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
-  dag A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+  dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
+  dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+  dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1))));
+  dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3))));
 }
 
 def FltToIntLoad {
@@ -3016,18 +3034,46 @@ let AddedComplexity = 400 in {
   // P9 Altivec instructions that can be used to build vectors.
   // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
   // with complexities of existing build vector patterns in this file.
-  let Predicates = [HasP9Altivec] in {
-    def : Pat<(v2i64 (build_vector WordToDWord.A0, WordToDWord.A1)),
+  let Predicates = [HasP9Altivec, IsLittleEndian] in {
+    def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
+              (v2i64 (VEXTSW2D $A))>;
+    def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
+              (v2i64 (VEXTSH2D $A))>;
+    def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
+                      HWordToWord.LE_A2, HWordToWord.LE_A3)),
+              (v4i32 (VEXTSH2W $A))>;
+    def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
+                      ByteToWord.LE_A2, ByteToWord.LE_A3)),
+              (v4i32 (VEXTSB2W $A))>;
+    def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
+              (v2i64 (VEXTSB2D $A))>;
+  }
+
+  let Predicates = [HasP9Altivec, IsBigEndian] in {
+    def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
               (v2i64 (VEXTSW2D $A))>;
-    def : Pat<(v2i64 (build_vector HWordToDWord.A0, HWordToDWord.A1)),
+    def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
               (v2i64 (VEXTSH2D $A))>;
-    def : Pat<(v4i32 (build_vector HWordToWord.A0, HWordToWord.A1,
-                      HWordToWord.A2, HWordToWord.A3)),
+    def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
+                      HWordToWord.BE_A2, HWordToWord.BE_A3)),
               (v4i32 (VEXTSH2W $A))>;
-    def : Pat<(v4i32 (build_vector ByteToWord.A0, ByteToWord.A1,
-                      ByteToWord.A2, ByteToWord.A3)),
+    def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
+                      ByteToWord.BE_A2, ByteToWord.BE_A3)),
               (v4i32 (VEXTSB2W $A))>;
-    def : Pat<(v2i64 (build_vector ByteToDWord.A0, ByteToDWord.A1)),
+    def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
               (v2i64 (VEXTSB2D $A))>;
   }
+
+  let Predicates = [HasP9Altivec] in {
+    def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)),
+              (v2i64 (VEXTSB2D $A))>;
+    def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)),
+              (v2i64 (VEXTSH2D $A))>;
+    def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)),
+              (v2i64 (VEXTSW2D $A))>;
+    def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)),
+              (v4i32 (VEXTSB2W $A))>;
+    def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)),
+              (v4i32 (VEXTSH2W $A))>;
+  }
 }

Modified: llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll?rev=307169&r1=307168&r2=307169&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll Wed Jul  5 09:00:38 2017
@@ -1,12 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9
-target triple = "powerpc64le-unknown-linux-gnu"
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE
+
+define <4 x i32> @vextsb2wLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2wLE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vextsb2w 2, 2
+; CHECK-LE-NEXT:    blr
+; CHECK-BE-LABEL: vextsb2wLE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE:         vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    vextsb2w 2, 2
+; CHECK-BE-NEXT:    blr
 
-define <4 x i32> @vextsb2w(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2w:
-; PWR9:       # BB#0: # %entry
-; PWR9-NEXT:    vextsb2w 2, 2
-; PWR9-NEXT:    blr
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
   %conv = sext i8 %vecext to i32
@@ -23,11 +29,17 @@ entry:
   ret <4 x i32> %vecinit9
 }
 
-define <2 x i64> @vextsb2d(<16 x i8> %a) {
-; PWR9-LABEL: vextsb2d:
-; PWR9:       # BB#0: # %entry
-; PWR9-NEXT:    vextsb2d 2, 2
-; PWR9-NEXT:    blr
+define <2 x i64> @vextsb2dLE(<16 x i8> %a) {
+; CHECK-LE-LABEL: vextsb2dLE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vextsb2d 2, 2
+; CHECK-LE-NEXT:    blr
+; CHECK-BE-LABEL: vextsb2dLE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE:         vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    vextsb2d 2, 2
+; CHECK-BE-NEXT:    blr
+
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
   %conv = sext i8 %vecext to i64
@@ -38,11 +50,17 @@ entry:
   ret <2 x i64> %vecinit3
 }
 
-define <4 x i32> @vextsh2w(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2w:
-; PWR9:       # BB#0: # %entry
-; PWR9-NEXT:    vextsh2w 2, 2
-; PWR9-NEXT:    blr
+define <4 x i32> @vextsh2wLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2wLE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vextsh2w 2, 2
+; CHECK-LE-NEXT:    blr
+; CHECK-BE-LABEL: vextsh2wLE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE:         vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    vextsh2w 2, 2
+; CHECK-BE-NEXT:    blr
+
 entry:
   %vecext = extractelement <8 x i16> %a, i32 0
   %conv = sext i16 %vecext to i32
@@ -59,11 +77,17 @@ entry:
   ret <4 x i32> %vecinit9
 }
 
-define <2 x i64> @vextsh2d(<8 x i16> %a) {
-; PWR9-LABEL: vextsh2d:
-; PWR9:       # BB#0: # %entry
-; PWR9-NEXT:    vextsh2d 2, 2
-; PWR9-NEXT:    blr
+define <2 x i64> @vextsh2dLE(<8 x i16> %a) {
+; CHECK-LE-LABEL: vextsh2dLE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vextsh2d 2, 2
+; CHECK-LE-NEXT:    blr
+; CHECK-BE-LABEL: vextsh2dLE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE:         vperm 2, 2, 2, 3
+; CHECK-BE-NEXT:    vextsh2d 2, 2
+; CHECK-BE-NEXT:    blr
+
 entry:
   %vecext = extractelement <8 x i16> %a, i32 0
   %conv = sext i16 %vecext to i64
@@ -74,11 +98,17 @@ entry:
   ret <2 x i64> %vecinit3
 }
 
-define <2 x i64> @vextsw2d(<4 x i32> %a) {
-; PWR9-LABEL: vextsw2d:
-; PWR9:       # BB#0: # %entry
-; PWR9-NEXT:    vextsw2d 2, 2
-; PWR9-NEXT:    blr
+define <2 x i64> @vextsw2dLE(<4 x i32> %a) {
+; CHECK-LE-LABEL: vextsw2dLE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vextsw2d 2, 2
+; CHECK-LE-NEXT:    blr
+; CHECK-BE-LABEL: vextsw2dLE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE:         vmrgew
+; CHECK-BE-NEXT:    vextsw2d 2, 2
+; CHECK-BE-NEXT:    blr
+
 entry:
   %vecext = extractelement <4 x i32> %a, i32 0
   %conv = sext i32 %vecext to i64
@@ -88,3 +118,170 @@ entry:
   %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
   ret <2 x i64> %vecinit3
 }
+
+define <4 x i32> @vextsb2wBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2wBE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NEXT:    vextsb2w 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-LE-LABEL: vextsb2wBE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vsldoi 2, 2, 2, 13
+; CHECK-LE-NEXT:    vextsb2w 2, 2
+; CHECK-LE-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 3
+  %conv = sext i8 %vecext to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %vecext1 = extractelement <16 x i8> %a, i32 7
+  %conv2 = sext i8 %vecext1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %vecext4 = extractelement <16 x i8> %a, i32 11
+  %conv5 = sext i8 %vecext4 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %vecext7 = extractelement <16 x i8> %a, i32 15
+  %conv8 = sext i8 %vecext7 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsb2dBE(<16 x i8> %a) {
+; CHECK-BE-LABEL: vextsb2dBE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NEXT:    vextsb2d 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-LE-LABEL: vextsb2dBE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vsldoi 2, 2, 2, 9
+; CHECK-LE-NEXT:    vextsb2d 2, 2
+; CHECK-LE-NEXT:    blr
+entry:
+  %vecext = extractelement <16 x i8> %a, i32 7
+  %conv = sext i8 %vecext to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %vecext1 = extractelement <16 x i8> %a, i32 15
+  %conv2 = sext i8 %vecext1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+}
+
+define <4 x i32> @vextsh2wBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2wBE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NEXT:    vextsh2w 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-LE-LABEL: vextsh2wBE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vsldoi 2, 2, 2, 14
+; CHECK-LE-NEXT:    vextsh2w 2, 2
+; CHECK-LE-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 1
+  %conv = sext i16 %vecext to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %vecext1 = extractelement <8 x i16> %a, i32 3
+  %conv2 = sext i16 %vecext1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %vecext4 = extractelement <8 x i16> %a, i32 5
+  %conv5 = sext i16 %vecext4 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %vecext7 = extractelement <8 x i16> %a, i32 7
+  %conv8 = sext i16 %vecext7 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+}
+
+define <2 x i64> @vextsh2dBE(<8 x i16> %a) {
+; CHECK-BE-LABEL: vextsh2dBE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NEXT:    vextsh2d 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-LE-LABEL: vextsh2dBE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vsldoi 2, 2, 2, 10
+; CHECK-LE-NEXT:    vextsh2d 2, 2
+; CHECK-LE-NEXT:    blr
+entry:
+  %vecext = extractelement <8 x i16> %a, i32 3
+  %conv = sext i16 %vecext to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %vecext1 = extractelement <8 x i16> %a, i32 7
+  %conv2 = sext i16 %vecext1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextsw2dBE(<4 x i32> %a) {
+; CHECK-BE-LABEL: vextsw2dBE:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NEXT:    vextsw2d 2, 2
+; CHECK-BE-NEXT:    blr
+; CHECK-LE-LABEL: vextsw2dBE:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NEXT:    vsldoi 2, 2, 2, 12
+; CHECK-LE-NEXT:    vextsw2d 2, 2
+; CHECK-LE-NEXT:    blr
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 1
+  %conv = sext i32 %vecext to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %a, i32 3
+  %conv2 = sext i32 %vecext1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+}
+
+define <2 x i64> @vextDiffVectors(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LE-LABEL: vextDiffVectors:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NOT:     vextsw2d
+
+; CHECK-BE-LABEL: vextDiffVectors:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NOT:     vextsw2d
+entry:
+  %vecext = extractelement <4 x i32> %a, i32 0
+  %conv = sext i32 %vecext to i64
+  %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0
+  %vecext1 = extractelement <4 x i32> %b, i32 2
+  %conv2 = sext i32 %vecext1 to i64
+  %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1
+  ret <2 x i64> %vecinit3
+}
+
+define <8 x i16> @testInvalidExtend(<16 x i8> %a) {
+entry:
+; CHECK-LE-LABEL: testInvalidExtend:
+; CHECK-LE:       # BB#0: # %entry
+; CHECK-LE-NOT:     vexts
+
+; CHECK-BE-LABEL: testInvalidExtend:
+; CHECK-BE:       # BB#0: # %entry
+; CHECK-BE-NOT:     vexts
+
+  %vecext = extractelement <16 x i8> %a, i32 0
+  %conv = sext i8 %vecext to i16
+  %vecinit = insertelement <8 x i16> undef, i16 %conv, i32 0
+  %vecext1 = extractelement <16 x i8> %a, i32 2
+  %conv2 = sext i8 %vecext1 to i16
+  %vecinit3 = insertelement <8 x i16> %vecinit, i16 %conv2, i32 1
+  %vecext4 = extractelement <16 x i8> %a, i32 4
+  %conv5 = sext i8 %vecext4 to i16
+  %vecinit6 = insertelement <8 x i16> %vecinit3, i16 %conv5, i32 2
+  %vecext7 = extractelement <16 x i8> %a, i32 6
+  %conv8 = sext i8 %vecext7 to i16
+  %vecinit9 = insertelement <8 x i16> %vecinit6, i16 %conv8, i32 3
+  %vecext10 = extractelement <16 x i8> %a, i32 8
+  %conv11 = sext i8 %vecext10 to i16
+  %vecinit12 = insertelement <8 x i16> %vecinit9, i16 %conv11, i32 4
+  %vecext13 = extractelement <16 x i8> %a, i32 10
+  %conv14 = sext i8 %vecext13 to i16
+  %vecinit15 = insertelement <8 x i16> %vecinit12, i16 %conv14, i32 5
+  %vecext16 = extractelement <16 x i8> %a, i32 12
+  %conv17 = sext i8 %vecext16 to i16
+  %vecinit18 = insertelement <8 x i16> %vecinit15, i16 %conv17, i32 6
+  %vecext19 = extractelement <16 x i8> %a, i32 14
+  %conv20 = sext i8 %vecext19 to i16
+  %vecinit21 = insertelement <8 x i16> %vecinit18, i16 %conv20, i32 7
+  ret <8 x i16> %vecinit21
+}