[llvm] r372721 - [ARM] Split large widening MVE loads

David Green via llvm-commits <llvm-commits at lists.llvm.org>
Tue Sep 24 03:53:09 PDT 2019


Author: dmgreen
Date: Tue Sep 24 03:53:09 2019
New Revision: 372721

URL: http://llvm.org/viewvc/llvm-project?rev=372721&view=rev
Log:
[ARM] Split large widening MVE loads

Similar to rL372717, we can force the splitting of extends of vector loads in
MVE, in order to use the more efficient widening loads rather than going
through expensive extends. This adds a combine that detects extends of loads
early and splits the load in two, from where normal legalisation will kick in
and produce a series of widening loads.

Differential Revision: https://reviews.llvm.org/D67909
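
As an illustration, the pattern this combine looks for is a sign or zero
extend whose operand is a simple, single-use, non-extending vector load of a
wider-than-legal type. A minimal IR sketch, modelled on the tests updated
below (the function name is made up):

  define void @sext_load_example(<16 x i16>* %dest, <16 x i8>* %src) {
  entry:
    %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
    %ext = sext <16 x i8> %wide.load to <16 x i16>
    store <16 x i16> %ext, <16 x i16>* %dest, align 2
    ret void
  }

The combine splits the <16 x i8> load into two <8 x i8> halves and folds the
extend into each half, which MVE can then select as vldrb.s16 widening loads.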

Modified:
    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
    llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll

Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=372721&r1=372720&r2=372721&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Tue Sep 24 03:53:09 2019
@@ -898,9 +898,6 @@ ARMTargetLowering::ARMTargetLowering(con
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRL);
     setTargetDAGCombine(ISD::SRA);
-    setTargetDAGCombine(ISD::SIGN_EXTEND);
-    setTargetDAGCombine(ISD::ZERO_EXTEND);
-    setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
@@ -922,6 +919,9 @@ ARMTargetLowering::ARMTargetLowering(con
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
     setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -13694,6 +13694,71 @@ static SDValue PerformShiftCombine(SDNod
   return SDValue();
 }
 
+// Look for a sign/zero extend of a larger-than-legal load. This can be split
+// into two extending loads, which are simpler to deal with than an arbitrary
+// sign extend.
+static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
+  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return SDValue();
+  EVT FromVT = LD->getValueType(0);
+  EVT ToVT = N->getValueType(0);
+  if (!ToVT.isVector())
+    return SDValue();
+  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+  EVT ToEltVT = ToVT.getVectorElementType();
+  EVT FromEltVT = FromVT.getVectorElementType();
+
+  unsigned NumElements = 0;
+  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+    NumElements = 4;
+  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+    NumElements = 8;
+  if (NumElements == 0 ||
+      FromVT.getVectorNumElements() == NumElements ||
+      FromVT.getVectorNumElements() % NumElements != 0 ||
+      !isPowerOf2_32(NumElements))
+    return SDValue();
+
+  SDLoc DL(LD);
+  // Details about the old load
+  SDValue Ch = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  unsigned Alignment = LD->getOriginalAlignment();
+  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = LD->getAAInfo();
+
+  ISD::LoadExtType NewExtType =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
+  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+  // Split the load in half, each side of which is extended separately. This
+  // is good enough, as legalisation will take it from there. They are either
+  // already legal or they will be split further into something that is
+  // legal.
+  SDValue NewLoad1 =
+      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
+                  LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
+  SDValue NewLoad2 =
+      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                  Alignment, MMOFlags, AAInfo);
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                 SDValue(NewLoad1.getNode(), 1),
+                                 SDValue(NewLoad2.getNode(), 1));
+  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+}
+
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
@@ -13731,6 +13796,10 @@ static SDValue PerformExtendCombine(SDNo
     }
   }
 
+  if (ST->hasMVEIntegerOps())
+    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+      return NewLoad;
+
   return SDValue();
 }
 
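To make the halving concrete: for the <16 x i8> to <16 x i32> cases in the
tests below, the combine rewrites the extend of the load as two
<8 x i8> -> <8 x i32> extending loads, 8 bytes apart. <8 x i32> is still
wider than a legal MVE type, so normal legalisation splits each half again,
and we end up with four vldrb widening loads at byte offsets 0, 4, 8 and 12,
as in the updated CHECK lines. A sketch of such an input, modelled on
foo_int32_int8_double below (the extend and store are implied by the test's
signature, since the diff context only shows the load):

  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %ext = sext <16 x i8> %wide.load to <16 x i32>
  store <16 x i32> %ext, <16 x i32>* %dest, align 4
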

Modified: llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll?rev=372721&r1=372720&r2=372721&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll Tue Sep 24 03:53:09 2019
@@ -134,51 +134,14 @@ entry:
 define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vstrw.32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmovlb.s8 q0, q1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vldrb.s32 q0, [r1, #4]
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q2, [r1, #12]
+; CHECK-NEXT:    vldrb.s32 q3, [r1, #8]
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -190,42 +153,9 @@ entry:
 define void @foo_int16_int8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int16_int8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vldrb.s16 q0, [r1]
+; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
 ; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmovlb.s8 q0, q1
 ; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -238,26 +168,9 @@ entry:
 define void @foo_int32_int16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int16_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -311,48 +224,14 @@ entry:
 define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vand q2, q2, q1
+; CHECK-NEXT:    vldrb.u32 q0, [r1, #4]
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q2, [r1, #12]
+; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vand q2, q2, q1
-; CHECK-NEXT:    vstrw.32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vand q0, q2, q1
-; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -364,42 +243,9 @@ entry:
 define void @foo_uint16_uint8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint16_uint8_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vldrb.u16 q0, [r1]
+; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
 ; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmovlb.u8 q0, q1
 ; CHECK-NEXT:    vstrh.16 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -412,26 +258,9 @@ entry:
 define void @foo_uint32_uint16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_double:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r1]
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vldrh.u32 q0, [r1]
+; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -445,34 +274,18 @@ entry:
 define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int8_both:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q0, [r1]
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmovlb.s8 q1, q1
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov.32 q2[3], r1
+; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[5]
+; CHECK-NEXT:    vmov.32 q0[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vmov.32 q0[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov.32 q0[3], r2
+; CHECK-NEXT:    vmovlb.u16 q2, q0
+; CHECK-NEXT:    vldrb.s16 q0, [r1]
 ; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmovlb.u16 q2, q2
 ; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q1[1]
@@ -481,26 +294,9 @@ define void @foo_int32_int8_both(<16 x i
 ; CHECK-NEXT:    vmov.32 q2[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q1[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    vmovlb.u16 q1, q2
 ; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vmovlb.s8 q0, q1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
@@ -532,27 +328,10 @@ entry:
 define <8 x i16>* @foo_uint32_uint16_double_offset(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_double_offset:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q0, [r1, #16]!
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmovlb.s16 q0, q1
+; CHECK-NEXT:    vldrh.s32 q0, [r1, #16]!
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -566,48 +345,14 @@ entry:
 define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_quad_offset:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]!
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[6]
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmovlb.s16 q2, q0
-; CHECK-NEXT:    vldrh.u16 q0, [r1, #16]
-; CHECK-NEXT:    vstrw.32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q1, q2
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vstrw.32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.32 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.32 q1[3], r2
-; CHECK-NEXT:    vmovlb.s16 q0, q1
-; CHECK-NEXT:    vstrw.32 q0, [r0, #32]
+; CHECK-NEXT:    vldrh.s32 q0, [r1, #32]!
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #24]
+; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
+; CHECK-NEXT:    vstrw.32 q0, [r0]
+; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT:    mov r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
