[llvm] r353724 - [PowerPC] Avoid scalarization of vector truncate

Roland Froese via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 11 09:29:14 PST 2019


Author: froese
Date: Mon Feb 11 09:29:14 2019
New Revision: 353724

URL: http://llvm.org/viewvc/llvm-project?rev=353724&view=rev
Log:
[PowerPC] Avoid scalarization of vector truncate

The PowerPC code generator currently scalarizes vector truncates that would fit in a vector register, resulting in vector extracts, scalar operations, and vector merges. This patch custom lowers a vector truncate that would fit in a register to a vector shuffle instead.

Differential Revision: https://reviews.llvm.org/D56507

Modified:
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
    llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=353724&r1=353723&r2=353724&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Mon Feb 11 09:29:14 2019
@@ -118,6 +118,8 @@ STATISTIC(NumSiblingCalls, "Number of si
 
 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
 
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -639,6 +641,14 @@ PPCTargetLowering::PPCTargetLowering(con
     // with merges, splats, etc.
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
 
+    // Vector truncates to sub-word integers that fit in an Altivec/VSX register
+    // are cheap, so handle them before they get expanded to scalar.
+    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+
     setOperationAction(ISD::AND   , MVT::v4i32, Legal);
     setOperationAction(ISD::OR    , MVT::v4i32, Legal);
     setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
@@ -6794,6 +6804,61 @@ SDValue PPCTargetLowering::LowerTRUNCATE
                      Op.getOperand(0));
 }
 
+SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
+                                               SelectionDAG &DAG) const {
+
+  // Implements a vector truncate that fits in a vector register as a shuffle.
+  // We want to legalize vector truncates down to where the source fits in
+  // a vector register (and target is therefore smaller than vector register
+  // size).  At that point legalization will try to custom lower the sub-legal
+  // result and get here - where we can contain the truncate as a single target
+  // operation.
+
+  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
+  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
+  //
+  // We will implement it for big-endian ordering as this (where u denotes
+  // undefined):
+  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
+  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+  // 
+  // The same operation in little-endian ordering will be:
+  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
+  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
+
+  assert(Op.getValueType().isVector() && "Vector type expected.");
+
+  SDLoc DL(Op);
+  SDValue N1 = Op.getOperand(0);
+  unsigned SrcSize = N1.getValueType().getSizeInBits();
+  assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
+  SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+
+  EVT TrgVT = Op.getValueType();
+  unsigned TrgNumElts = TrgVT.getVectorNumElements();
+  EVT EltVT = TrgVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  // First list the elements we want to keep.
+  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
+  SmallVector<int, 16> ShuffV;
+  if (Subtarget.isLittleEndian())
+    for (unsigned i = 0; i < TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult);
+  else
+    for (unsigned i = 1; i <= TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult - 1);
+
+  // Populate the remaining elements with undefs.
+  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
+    // ShuffV.push_back(i + WideNumElts);
+    ShuffV.push_back(WideNumElts + 1);
+
+  SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
+  return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+}
+
 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
 /// possible.
 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -9641,6 +9706,14 @@ void PPCTargetLowering::ReplaceNodeResul
       return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
+  case ISD::TRUNCATE: {
+    EVT TrgVT = N->getValueType(0);
+    if (TrgVT.isVector() &&
+        isOperationCustom(N->getOpcode(), TrgVT) &&
+        N->getOperand(0).getValueType().getSizeInBits() <= 128)
+      Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+    return;
+  }
   case ISD::BITCAST:
     // Don't handle bitcast here.
     return;

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h?rev=353724&r1=353723&r2=353724&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h Mon Feb 11 09:29:14 2019
@@ -952,6 +952,8 @@ namespace llvm {
     SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
                                  const SDLoc &dl) const;
 
+    SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
 

Modified: llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll?rev=353724&r1=353723&r2=353724&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll Mon Feb 11 09:29:14 2019
@@ -10,90 +10,17 @@ define void @test8i8(<8 x i8>* nocapture
 ; CHECK-LABEL: test8i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    mfvsrd r4, v2
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    clrldi r5, r4, 48
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    rldicl r5, r4, 48, 48
-; CHECK-NEXT:    mtvsrd f2, r5
-; CHECK-NEXT:    rldicl r5, r4, 32, 48
-; CHECK-NEXT:    rldicl r4, r4, 16, 48
-; CHECK-NEXT:    mtvsrd f3, r5
-; CHECK-NEXT:    xxswapd v2, vs1
-; CHECK-NEXT:    mfvsrd r5, f0
-; CHECK-NEXT:    xxswapd v3, vs2
-; CHECK-NEXT:    mtvsrd f0, r4
-; CHECK-NEXT:    clrldi r4, r5, 48
-; CHECK-NEXT:    mtvsrd f1, r4
-; CHECK-NEXT:    rldicl r4, r5, 48, 48
-; CHECK-NEXT:    xxswapd v4, vs0
-; CHECK-NEXT:    mtvsrd f2, r4
-; CHECK-NEXT:    rldicl r4, r5, 32, 48
-; CHECK-NEXT:    rldicl r5, r5, 16, 48
-; CHECK-NEXT:    vmrglb v2, v3, v2
-; CHECK-NEXT:    xxswapd v3, vs3
-; CHECK-NEXT:    mtvsrd f3, r4
-; CHECK-NEXT:    xxswapd v5, vs1
-; CHECK-NEXT:    mtvsrd f0, r5
-; CHECK-NEXT:    xxswapd v0, vs2
-; CHECK-NEXT:    xxswapd v1, vs3
-; CHECK-NEXT:    vmrglb v3, v4, v3
-; CHECK-NEXT:    xxswapd v6, vs0
-; CHECK-NEXT:    vmrglb v4, v0, v5
-; CHECK-NEXT:    vmrglb v5, v6, v1
-; CHECK-NEXT:    vmrglh v2, v3, v2
-; CHECK-NEXT:    vmrglh v3, v5, v4
-; CHECK-NEXT:    vmrglw v2, v2, v3
+; CHECK-NEXT:    vpkuhum v2, v2, v2
 ; CHECK-NEXT:    xxswapd vs0, v2
 ; CHECK-NEXT:    stfdx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test8i8:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lhz r4, -18(r1)
-; CHECK-BE-NEXT:    stb r4, -48(r1)
-; CHECK-BE-NEXT:    lhz r4, -20(r1)
-; CHECK-BE-NEXT:    stb r4, -64(r1)
-; CHECK-BE-NEXT:    lhz r4, -22(r1)
-; CHECK-BE-NEXT:    stb r4, -80(r1)
-; CHECK-BE-NEXT:    lhz r4, -24(r1)
-; CHECK-BE-NEXT:    stb r4, -96(r1)
-; CHECK-BE-NEXT:    lhz r4, -26(r1)
-; CHECK-BE-NEXT:    stb r4, -112(r1)
-; CHECK-BE-NEXT:    lhz r4, -28(r1)
-; CHECK-BE-NEXT:    stb r4, -128(r1)
-; CHECK-BE-NEXT:    lhz r4, -30(r1)
-; CHECK-BE-NEXT:    stb r4, -144(r1)
-; CHECK-BE-NEXT:    lhz r4, -32(r1)
-; CHECK-BE-NEXT:    stb r4, -160(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
-; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -80
-; CHECK-BE-NEXT:    lxvw4x v4, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -96
-; CHECK-BE-NEXT:    lxvw4x v5, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -112
-; CHECK-BE-NEXT:    lxvw4x v0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -128
-; CHECK-BE-NEXT:    lxvw4x v1, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -144
-; CHECK-BE-NEXT:    lxvw4x v6, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -160
-; CHECK-BE-NEXT:    lxvw4x v7, 0, r4
-; CHECK-BE-NEXT:    vmrghb v2, v3, v2
-; CHECK-BE-NEXT:    vmrghb v3, v5, v4
-; CHECK-BE-NEXT:    vmrghb v4, v1, v0
-; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
-; CHECK-BE-NEXT:    vmrghb v5, v7, v6
-; CHECK-BE-NEXT:    vmrghh v3, v5, v4
-; CHECK-BE-NEXT:    vmrghw v2, v3, v2
-; CHECK-BE-NEXT:    stxvd2x v2, 0, r4
+; CHECK-BE-NEXT:    addi r5, r1, -16
+; CHECK-BE-NEXT:    vpkuhum v2, v2, v2
+; CHECK-BE-NEXT:    stxvd2x v2, 0, r5
 ; CHECK-BE-NEXT:    ld r4, -16(r1)
 ; CHECK-BE-NEXT:    std r4, 0(r3)
 ; CHECK-BE-NEXT:    blr
@@ -108,53 +35,17 @@ define void @test4i8(<4 x i8>* nocapture
 ; CHECK-LABEL: test4i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    mfvsrd r4, f0
-; CHECK-NEXT:    clrldi r5, r4, 48
-; CHECK-NEXT:    mtvsrd f0, r5
-; CHECK-NEXT:    rldicl r5, r4, 48, 48
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    rldicl r5, r4, 32, 48
-; CHECK-NEXT:    rldicl r4, r4, 16, 48
-; CHECK-NEXT:    mtvsrd f2, r5
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtvsrd f3, r4
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    xxswapd v4, vs2
-; CHECK-NEXT:    xxswapd v5, vs3
-; CHECK-NEXT:    vmrglb v2, v3, v2
-; CHECK-NEXT:    vmrglb v3, v5, v4
-; CHECK-NEXT:    vmrglh v2, v3, v2
+; CHECK-NEXT:    vpkuhum v2, v2, v2
 ; CHECK-NEXT:    xxsldwi vs0, v2, v2, 2
 ; CHECK-NEXT:    stfiwx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4i8:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lhz r4, -26(r1)
-; CHECK-BE-NEXT:    stb r4, -48(r1)
-; CHECK-BE-NEXT:    lhz r4, -28(r1)
-; CHECK-BE-NEXT:    stb r4, -64(r1)
-; CHECK-BE-NEXT:    lhz r4, -30(r1)
-; CHECK-BE-NEXT:    stb r4, -80(r1)
-; CHECK-BE-NEXT:    lhz r4, -32(r1)
-; CHECK-BE-NEXT:    stb r4, -96(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
-; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -80
-; CHECK-BE-NEXT:    lxvw4x v4, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -96
-; CHECK-BE-NEXT:    lxvw4x v5, 0, r4
-; CHECK-BE-NEXT:    vmrghb v2, v3, v2
-; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghb v3, v5, v4
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
-; CHECK-BE-NEXT:    stxvw4x v2, 0, r4
+; CHECK-BE-NEXT:    addi r5, r1, -16
+; CHECK-BE-NEXT:    vpkuhum v2, v2, v2
+; CHECK-BE-NEXT:    stxvw4x v2, 0, r5
 ; CHECK-BE-NEXT:    lwz r4, -16(r1)
 ; CHECK-BE-NEXT:    stw r4, 0(r3)
 ; CHECK-BE-NEXT:    blr
@@ -168,54 +59,23 @@ entry:
 define void @test4i8w(<4 x i8>* nocapture %Sink, <4 x i32>* nocapture readonly %SrcPtr) {
 ; CHECK-LABEL: test4i8w:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    mfvsrwz r4, v2
-; CHECK-NEXT:    xxsldwi vs1, v2, v2, 1
-; CHECK-NEXT:    xxsldwi vs3, v2, v2, 3
-; CHECK-NEXT:    mtvsrd f2, r4
-; CHECK-NEXT:    mfvsrwz r4, f0
-; CHECK-NEXT:    mfvsrwz r5, f1
-; CHECK-NEXT:    xxswapd v4, vs2
-; CHECK-NEXT:    mtvsrd f0, r4
-; CHECK-NEXT:    mfvsrwz r4, f3
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtvsrd f3, r4
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    xxswapd v5, vs3
-; CHECK-NEXT:    vmrglb v2, v3, v2
-; CHECK-NEXT:    vmrglb v3, v5, v4
-; CHECK-NEXT:    vmrglh v2, v3, v2
+; CHECK-NEXT:    addis r5, r2, .LCPI2_0@toc@ha
+; CHECK-NEXT:    lvx v3, 0, r4
+; CHECK-NEXT:    addi r5, r5, .LCPI2_0@toc@l
+; CHECK-NEXT:    lvx v2, 0, r5
+; CHECK-NEXT:    vperm v2, v3, v3, v2
 ; CHECK-NEXT:    xxsldwi vs0, v2, v2, 2
 ; CHECK-NEXT:    stfiwx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4i8w:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lwz r4, -20(r1)
-; CHECK-BE-NEXT:    stb r4, -48(r1)
-; CHECK-BE-NEXT:    lwz r4, -24(r1)
-; CHECK-BE-NEXT:    stb r4, -64(r1)
-; CHECK-BE-NEXT:    lwz r4, -28(r1)
-; CHECK-BE-NEXT:    stb r4, -80(r1)
-; CHECK-BE-NEXT:    lwz r4, -32(r1)
-; CHECK-BE-NEXT:    stb r4, -96(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI2_0@toc@ha
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
+; CHECK-BE-NEXT:    addi r4, r5, .LCPI2_0@toc@l
 ; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -80
-; CHECK-BE-NEXT:    lxvw4x v4, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -96
-; CHECK-BE-NEXT:    lxvw4x v5, 0, r4
-; CHECK-BE-NEXT:    vmrghb v2, v3, v2
 ; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghb v3, v5, v4
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-BE-NEXT:    stxvw4x v2, 0, r4
 ; CHECK-BE-NEXT:    lwz r4, -16(r1)
 ; CHECK-BE-NEXT:    stw r4, 0(r3)
@@ -231,15 +91,7 @@ define void @test2i8(<2 x i8>* nocapture
 ; CHECK-LABEL: test2i8:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    mfvsrd r4, f0
-; CHECK-NEXT:    clrldi r5, r4, 48
-; CHECK-NEXT:    rldicl r4, r4, 48, 48
-; CHECK-NEXT:    mtvsrd f0, r5
-; CHECK-NEXT:    mtvsrd f1, r4
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    vmrglb v2, v3, v2
+; CHECK-NEXT:    vpkuhum v2, v2, v2
 ; CHECK-NEXT:    xxswapd vs0, v2
 ; CHECK-NEXT:    mfvsrd r4, f0
 ; CHECK-NEXT:    clrldi r4, r4, 48
@@ -248,20 +100,10 @@ define void @test2i8(<2 x i8>* nocapture
 ;
 ; CHECK-BE-LABEL: test2i8:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lhz r4, -30(r1)
-; CHECK-BE-NEXT:    stb r4, -48(r1)
-; CHECK-BE-NEXT:    lhz r4, -32(r1)
-; CHECK-BE-NEXT:    stb r4, -64(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
-; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghb v2, v3, v2
-; CHECK-BE-NEXT:    stxvw4x v2, 0, r4
+; CHECK-BE-NEXT:    addi r5, r1, -16
+; CHECK-BE-NEXT:    vpkuhum v2, v2, v2
+; CHECK-BE-NEXT:    stxvw4x v2, 0, r5
 ; CHECK-BE-NEXT:    lhz r4, -16(r1)
 ; CHECK-BE-NEXT:    sth r4, 0(r3)
 ; CHECK-BE-NEXT:    blr
@@ -276,54 +118,17 @@ define void @test4i16(<4 x i16>* nocaptu
 ; CHECK-LABEL: test4i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    mfvsrwz r4, v2
-; CHECK-NEXT:    xxsldwi vs1, v2, v2, 1
-; CHECK-NEXT:    xxsldwi vs3, v2, v2, 3
-; CHECK-NEXT:    mtvsrd f2, r4
-; CHECK-NEXT:    mfvsrwz r4, f0
-; CHECK-NEXT:    mfvsrwz r5, f1
-; CHECK-NEXT:    xxswapd v4, vs2
-; CHECK-NEXT:    mtvsrd f0, r4
-; CHECK-NEXT:    mfvsrwz r4, f3
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtvsrd f3, r4
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    xxswapd v5, vs3
-; CHECK-NEXT:    vmrglh v2, v3, v2
-; CHECK-NEXT:    vmrglh v3, v5, v4
-; CHECK-NEXT:    vmrglw v2, v3, v2
+; CHECK-NEXT:    vpkuwum v2, v2, v2
 ; CHECK-NEXT:    xxswapd vs0, v2
 ; CHECK-NEXT:    stfdx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4i16:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lwz r4, -20(r1)
-; CHECK-BE-NEXT:    sth r4, -48(r1)
-; CHECK-BE-NEXT:    lwz r4, -24(r1)
-; CHECK-BE-NEXT:    sth r4, -64(r1)
-; CHECK-BE-NEXT:    lwz r4, -28(r1)
-; CHECK-BE-NEXT:    sth r4, -80(r1)
-; CHECK-BE-NEXT:    lwz r4, -32(r1)
-; CHECK-BE-NEXT:    sth r4, -96(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
-; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -80
-; CHECK-BE-NEXT:    lxvw4x v4, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -96
-; CHECK-BE-NEXT:    lxvw4x v5, 0, r4
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
-; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghh v3, v5, v4
-; CHECK-BE-NEXT:    vmrghw v2, v3, v2
-; CHECK-BE-NEXT:    stxvd2x v2, 0, r4
+; CHECK-BE-NEXT:    addi r5, r1, -16
+; CHECK-BE-NEXT:    vpkuwum v2, v2, v2
+; CHECK-BE-NEXT:    stxvd2x v2, 0, r5
 ; CHECK-BE-NEXT:    ld r4, -16(r1)
 ; CHECK-BE-NEXT:    std r4, 0(r3)
 ; CHECK-BE-NEXT:    blr
@@ -338,35 +143,17 @@ define void @test2i16(<2 x i16>* nocaptu
 ; CHECK-LABEL: test2i16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lvx v2, 0, r4
-; CHECK-NEXT:    xxswapd vs0, v2
-; CHECK-NEXT:    xxsldwi vs1, v2, v2, 1
-; CHECK-NEXT:    mfvsrwz r4, f0
-; CHECK-NEXT:    mfvsrwz r5, f1
-; CHECK-NEXT:    mtvsrd f0, r4
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    vmrglh v2, v3, v2
+; CHECK-NEXT:    vpkuwum v2, v2, v2
 ; CHECK-NEXT:    xxsldwi vs0, v2, v2, 2
 ; CHECK-NEXT:    stfiwx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test2i16:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT:    lwz r4, -28(r1)
-; CHECK-BE-NEXT:    sth r4, -48(r1)
-; CHECK-BE-NEXT:    lwz r4, -32(r1)
-; CHECK-BE-NEXT:    sth r4, -64(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
-; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
-; CHECK-BE-NEXT:    stxvw4x v2, 0, r4
+; CHECK-BE-NEXT:    addi r5, r1, -16
+; CHECK-BE-NEXT:    vpkuwum v2, v2, v2
+; CHECK-BE-NEXT:    stxvw4x v2, 0, r5
 ; CHECK-BE-NEXT:    lwz r4, -16(r1)
 ; CHECK-BE-NEXT:    stw r4, 0(r3)
 ; CHECK-BE-NEXT:    blr
@@ -381,33 +168,23 @@ define void @test2i16d(<2 x i16>* nocapt
 ; CHECK-LABEL: test2i16d:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-NEXT:    xxswapd vs1, vs0
-; CHECK-NEXT:    mfvsrwz r4, f0
-; CHECK-NEXT:    mtvsrd f0, r4
-; CHECK-NEXT:    mfvsrwz r5, f1
+; CHECK-NEXT:    addis r5, r2, .LCPI6_0@toc@ha
+; CHECK-NEXT:    addi r4, r5, .LCPI6_0@toc@l
+; CHECK-NEXT:    lvx v3, 0, r4
 ; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtvsrd f1, r5
-; CHECK-NEXT:    xxswapd v3, vs1
-; CHECK-NEXT:    vmrglh v2, v3, v2
+; CHECK-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-NEXT:    xxsldwi vs0, v2, v2, 2
 ; CHECK-NEXT:    stfiwx f0, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test2i16d:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lxvd2x vs0, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -32
-; CHECK-BE-NEXT:    stxvd2x vs0, 0, r4
-; CHECK-BE-NEXT:    lwz r4, -20(r1)
-; CHECK-BE-NEXT:    sth r4, -48(r1)
-; CHECK-BE-NEXT:    lwz r4, -28(r1)
-; CHECK-BE-NEXT:    sth r4, -64(r1)
-; CHECK-BE-NEXT:    addi r4, r1, -48
+; CHECK-BE-NEXT:    addis r5, r2, .LCPI6_0@toc@ha
 ; CHECK-BE-NEXT:    lxvw4x v2, 0, r4
-; CHECK-BE-NEXT:    addi r4, r1, -64
+; CHECK-BE-NEXT:    addi r4, r5, .LCPI6_0@toc@l
 ; CHECK-BE-NEXT:    lxvw4x v3, 0, r4
 ; CHECK-BE-NEXT:    addi r4, r1, -16
-; CHECK-BE-NEXT:    vmrghh v2, v3, v2
+; CHECK-BE-NEXT:    vperm v2, v2, v2, v3
 ; CHECK-BE-NEXT:    stxvw4x v2, 0, r4
 ; CHECK-BE-NEXT:    lwz r4, -16(r1)
 ; CHECK-BE-NEXT:    stw r4, 0(r3)




More information about the llvm-commits mailing list