[llvm] 7740216 - [DAG] Combine insert(shuffle(load), load, 0) into a single load

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed May 31 11:49:03 PDT 2023


Author: David Green
Date: 2023-05-31T19:48:57+01:00
New Revision: 7740216f2e1e6f4243a868bc41b9397ad2c7fb38

URL: https://github.com/llvm/llvm-project/commit/7740216f2e1e6f4243a868bc41b9397ad2c7fb38
DIFF: https://github.com/llvm/llvm-project/commit/7740216f2e1e6f4243a868bc41b9397ad2c7fb38.diff

LOG: [DAG] Combine insert(shuffle(load), load, 0) into a single load

Given an insert of a scalar load into a vector shuffle with mask
u,0,1,2,3,4,5,6 or 1,2,3,4,5,6,7,u (depending on the insert index),
it can be more profitable to convert the whole sequence to a single
load and avoid the shuffles. This adds a DAG combine for it, provided
the new load is still fast.

Differential Revision: https://reviews.llvm.org/D151029
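
To make the transform concrete, here is a minimal LLVM IR sketch of the
InsIndex == 0 case, reconstructed from the description above and the
inserti8_first test updated below. The shufflevector and insertelement lines
are not visible in the trimmed diff context, so treat their exact spelling
(e.g. the poison operand) as an assumption:

  define <8 x i8> @inserti8_first(ptr %p) {
    %q = getelementptr inbounds i8, ptr %p, i32 1
    %l1 = load <8 x i8>, ptr %q        ; vector load of bytes p+1 .. p+8
    %s1 = shufflevector <8 x i8> %l1, <8 x i8> poison, <8 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
    %l2 = load i8, ptr %p              ; scalar load of byte p+0
    %ins = insertelement <8 x i8> %s1, i8 %l2, i32 0
    ret <8 x i8> %ins
  }

Lane i of %ins is the byte at p+i, so the whole sequence is equivalent to one
8-byte load from %p, which on AArch64 becomes the single ldr d0, [x0] seen in
the updated CHECK lines below.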

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/test/CodeGen/AArch64/insertshuffleload.ll
    llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5fdc83c56cea5..805e4f677f12e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -604,6 +604,7 @@ namespace {
     SDValue combineRepeatedFPDivisors(SDNode *N);
     SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex);
     SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
+    SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -20952,6 +20953,99 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
   return DAG.getBitcast(VT, Shuf);
 }
 
+// Combine insert(shuffle(load, <u,0,1,2>), load, 0) into a single load if
+// possible and the new load will be fast. We use more loads but fewer shuffles
+// and inserts.
+SDValue DAGCombiner::combineInsertEltToLoad(SDNode *N, unsigned InsIndex) {
+  EVT VT = N->getValueType(0);
+
+  // InsIndex is expected to be the first or last lane.
+  if (!VT.isFixedLengthVector() ||
+      (InsIndex != 0 && InsIndex != VT.getVectorNumElements() - 1))
+    return SDValue();
+
+  // Look for a shuffle with the mask u,0,1,2,3,4,5,6 or 1,2,3,4,5,6,7,u
+  // depending on the InsIndex.
+  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+  SDValue Scalar = N->getOperand(1);
+  if (!Shuffle || !all_of(enumerate(Shuffle->getMask()), [&](auto P) {
+        return InsIndex == P.index() || P.value() < 0 ||
+               (InsIndex == 0 && P.value() == (int)P.index() - 1) ||
+               (InsIndex == VT.getVectorNumElements() - 1 &&
+                P.value() == (int)P.index() + 1);
+      }))
+    return SDValue();
+
+  // We optionally skip over an extend so long as both loads are extended in the
+  // same way from the same type.
+  unsigned Extend = 0;
+  if (Scalar.getOpcode() == ISD::ZERO_EXTEND ||
+      Scalar.getOpcode() == ISD::SIGN_EXTEND ||
+      Scalar.getOpcode() == ISD::ANY_EXTEND) {
+    Extend = Scalar.getOpcode();
+    Scalar = Scalar.getOperand(0);
+  }
+
+  auto *ScalarLoad = dyn_cast<LoadSDNode>(Scalar);
+  if (!ScalarLoad)
+    return SDValue();
+
+  SDValue Vec = Shuffle->getOperand(0);
+  if (Extend) {
+    if (Vec.getOpcode() != Extend)
+      return SDValue();
+    Vec = Vec.getOperand(0);
+  }
+  auto *VecLoad = dyn_cast<LoadSDNode>(Vec);
+  if (!VecLoad || Vec.getValueType().getScalarType() != Scalar.getValueType())
+    return SDValue();
+
+  int EltSize = ScalarLoad->getValueType(0).getScalarSizeInBits();
+  if (EltSize == 0 || EltSize % 8 != 0 || !ScalarLoad->isSimple() ||
+      !VecLoad->isSimple() || VecLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      ScalarLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+      ScalarLoad->getAddressSpace() != VecLoad->getAddressSpace())
+    return SDValue();
+
+  // Check that the offset between the pointers would produce a single
+  // contiguous load.
+  if (InsIndex == 0) {
+    if (!DAG.areNonVolatileConsecutiveLoads(ScalarLoad, VecLoad, EltSize / 8,
+                                            -1))
+      return SDValue();
+  } else {
+    if (!DAG.areNonVolatileConsecutiveLoads(
+            VecLoad, ScalarLoad, VT.getVectorNumElements() * EltSize / 8, -1))
+      return SDValue();
+  }
+
+  // And that the new unaligned load will be fast.
+  unsigned IsFast = 0;
+  Align NewAlign = commonAlignment(VecLoad->getAlign(), EltSize / 8);
+  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                              Vec.getValueType(), VecLoad->getAddressSpace(),
+                              NewAlign, VecLoad->getMemOperand()->getFlags(),
+                              &IsFast) ||
+      !IsFast)
+    return SDValue();
+
+  // Calculate the new Ptr and create the new load.
+  SDLoc DL(N);
+  SDValue Ptr = ScalarLoad->getBasePtr();
+  if (InsIndex != 0)
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), VecLoad->getBasePtr(),
+                      DAG.getConstant(EltSize / 8, DL, Ptr.getValueType()));
+  MachinePointerInfo PtrInfo =
+      InsIndex == 0 ? ScalarLoad->getPointerInfo()
+                    : VecLoad->getPointerInfo().getWithOffset(EltSize / 8);
+
+  SDValue Load = DAG.getLoad(VecLoad->getValueType(0), DL,
+                             ScalarLoad->getChain(), Ptr, PtrInfo, NewAlign);
+  DAG.makeEquivalentMemoryOrdering(ScalarLoad, Load.getValue(1));
+  DAG.makeEquivalentMemoryOrdering(VecLoad, Load.getValue(1));
+  return Extend ? DAG.getNode(Extend, DL, VT, Load) : Load;
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -21023,6 +21117,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
     return Shuf;
 
+  if (SDValue Shuf = combineInsertEltToLoad(N, Elt))
+    return Shuf;
+
   // Attempt to convert an insert_vector_elt chain into a legal build_vector.
   if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
     // vXi1 vector - we don't need to recurse.

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 777745d76e9f5..58d8ce2f1e67d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -11700,7 +11700,7 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
 
   int64_t Offset = 0;
   if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
-    return (Dist * Bytes == Offset);
+    return (Dist * (int64_t)Bytes == Offset);
   return false;
 }
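
The cast above matters because the new combine passes a Dist of -1 and Bytes
is an unsigned parameter in that function, so the old expression wrapped to a
large positive value instead of matching a negative Offset. A self-contained
C++ sketch of the pitfall (plain C++, not LLVM code; the values are
illustrative):

  #include <cassert>
  #include <cstdint>

  int main() {
    int Dist = -1;        // the new combine asks for the load one step before
    unsigned Bytes = 8;   // element size in bytes
    int64_t Offset = -8;  // pointer difference computed by equalBaseIndex

    // Old expression: Dist is converted to unsigned, -1 * 8 wraps to
    // 4294967288 and never compares equal to a negative Offset.
    assert(!(Dist * Bytes == Offset));

    // With the cast the multiply happens in int64_t: -1 * 8 == -8.
    assert(Dist * (int64_t)Bytes == Offset);
    return 0;
  }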
 

diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
index c9bdb9537157e..17ddd0d0b9972 100644
--- a/llvm/test/CodeGen/AArch64/insertshuffleload.ll
+++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
@@ -4,10 +4,7 @@
 define <8 x i8> @inserti8_first(ptr %p) {
 ; CHECK-LABEL: inserti8_first:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -20,11 +17,7 @@ define <8 x i8> @inserti8_first(ptr %p) {
 define <8 x i8> @inserti8_last(ptr %p) {
 ; CHECK-LABEL: inserti8_last:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x8, x0, #8
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #1
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -37,11 +30,8 @@ define <8 x i8> @inserti8_last(ptr %p) {
 define <8 x i16> @inserti8_first_sext(ptr %p) {
 ; CHECK-LABEL: inserti8_first_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ldrsb w8, [x0]
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
-; CHECK-NEXT:    mov v0.h[0], w8
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -56,11 +46,8 @@ define <8 x i16> @inserti8_first_sext(ptr %p) {
 define <8 x i16> @inserti8_last_sext(ptr %p) {
 ; CHECK-LABEL: inserti8_last_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldrsb w8, [x0, #8]
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
-; CHECK-NEXT:    mov v0.h[7], w8
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -75,11 +62,8 @@ define <8 x i16> @inserti8_last_sext(ptr %p) {
 define <8 x i16> @inserti8_first_zext(ptr %p) {
 ; CHECK-LABEL: inserti8_first_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #14
-; CHECK-NEXT:    mov v0.h[0], w8
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -94,11 +78,8 @@ define <8 x i16> @inserti8_first_zext(ptr %p) {
 define <8 x i16> @inserti8_last_zext(ptr %p) {
 ; CHECK-LABEL: inserti8_last_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldrb w8, [x0, #8]
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
-; CHECK-NEXT:    mov v0.h[7], w8
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -113,11 +94,7 @@ define <8 x i16> @inserti8_last_zext(ptr %p) {
 define <8 x i32> @inserti32_first(ptr %p) {
 ; CHECK-LABEL: inserti32_first:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur q1, [x0, #4]
-; CHECK-NEXT:    ldur q2, [x0, #20]
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT:    ext v1.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <8 x i32>, ptr %q
@@ -130,11 +107,8 @@ define <8 x i32> @inserti32_first(ptr %p) {
 define <8 x i32> @inserti32_last(ptr %p) {
 ; CHECK-LABEL: inserti32_last:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
-; CHECK-NEXT:    add x8, x0, #32
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #4
-; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #4
-; CHECK-NEXT:    ld1 { v1.s }[3], [x8]
+; CHECK-NEXT:    ldur q0, [x0, #4]
+; CHECK-NEXT:    ldur q1, [x0, #20]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
@@ -147,11 +121,9 @@ define <8 x i32> @inserti32_last(ptr %p) {
 define <8 x i32> @inserti32_first_multiuse(ptr %p) {
 ; CHECK-LABEL: inserti32_first_multiuse:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur q0, [x0, #4]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    ldur q1, [x0, #20]
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #12
-; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT:    ld1 { v2.s }[0], [x0]
+; CHECK-NEXT:    ldur q0, [x0, #4]
 ; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
@@ -168,12 +140,10 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 ; CHECK-LABEL: inserti32_last_multiuse:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    add x8, x0, #32
-; CHECK-NEXT:    ext v2.16b, v1.16b, v0.16b, #4
-; CHECK-NEXT:    ext v3.16b, v0.16b, v1.16b, #4
-; CHECK-NEXT:    ld1 { v2.s }[3], [x8]
-; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ldur q2, [x0, #4]
+; CHECK-NEXT:    ldur q3, [x0, #20]
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
@@ -187,9 +157,7 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 define <4 x float> @insertf32_first(ptr %p) {
 ; CHECK-LABEL: insertf32_first:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur q0, [x0, #4]
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #12
-; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <4 x float>, ptr %q
@@ -202,10 +170,7 @@ define <4 x float> @insertf32_first(ptr %p) {
 define <4 x float> @insertf32_last(ptr %p) {
 ; CHECK-LABEL: insertf32_last:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    add x8, x0, #16
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
-; CHECK-NEXT:    ld1 { v0.s }[3], [x8]
+; CHECK-NEXT:    ldur q0, [x0, #4]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 16
   %l1 = load <4 x float>, ptr %p
@@ -218,9 +183,7 @@ define <4 x float> @insertf32_last(ptr %p) {
 define <2 x i64> @inserti64_first(ptr %p) {
 ; CHECK-LABEL: inserti64_first:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #8
-; CHECK-NEXT:    ld1r { v0.2d }, [x8]
-; CHECK-NEXT:    ld1 { v0.d }[0], [x0]
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <2 x i64>, ptr %q
@@ -233,10 +196,7 @@ define <2 x i64> @inserti64_first(ptr %p) {
 define <2 x i64> @inserti64_last(ptr %p) {
 ; CHECK-LABEL: inserti64_last:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    add x8, x0, #16
-; CHECK-NEXT:    dup v0.2d, v0.d[1]
-; CHECK-NEXT:    ld1 { v0.d }[1], [x8]
+; CHECK-NEXT:    ldur q0, [x0, #8]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 16
   %l1 = load <2 x i64>, ptr %p
@@ -249,10 +209,7 @@ define <2 x i64> @inserti64_last(ptr %p) {
 define <8 x i8> @inserti8_first_undef(ptr %p) {
 ; CHECK-LABEL: inserti8_first_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -265,11 +222,7 @@ define <8 x i8> @inserti8_first_undef(ptr %p) {
 define <8 x i8> @inserti8_last_undef(ptr %p) {
 ; CHECK-LABEL: inserti8_last_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x8, x0, #8
-; CHECK-NEXT:    dup v0.8b, v0.b[1]
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 8
   %l1 = load <8 x i8>, ptr %p
@@ -445,10 +398,7 @@ define <8 x i8> @storebefore(ptr %p, ptr %r) {
 ; CHECK-LABEL: storebefore:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    strb wzr, [x1]
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   store i8 0, ptr %r
@@ -462,11 +412,8 @@ define <8 x i8> @storebefore(ptr %p, ptr %r) {
 define <8 x i8> @storeafter(ptr %p, ptr %r) {
 ; CHECK-LABEL: storeafter:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur d0, [x0, #1]
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #7
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    strb wzr, [x1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q

diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
index 7714f8dd92d73..5f56a82f3c511 100644
--- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll
@@ -6,37 +6,13 @@
 define <8 x i8> @inserti8_first(ptr %p) {
 ; CHECKLE-LABEL: inserti8_first:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.u16 q1, [r0, #1]
-; CHECKLE-NEXT:    ldrb r1, [r0]
-; CHECKLE-NEXT:    vmovx.f16 s10, s5
-; CHECKLE-NEXT:    vmovx.f16 s8, s4
-; CHECKLE-NEXT:    vins.f16 s10, s6
-; CHECKLE-NEXT:    vmovx.f16 s6, s6
-; CHECKLE-NEXT:    vmov.16 q0[0], r1
-; CHECKLE-NEXT:    vins.f16 s8, s5
-; CHECKLE-NEXT:    vins.f16 s6, s7
-; CHECKLE-NEXT:    vmov.f32 s1, s8
-; CHECKLE-NEXT:    vmov.f32 s2, s10
-; CHECKLE-NEXT:    vins.f16 s0, s4
-; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    vldrb.u16 q0, [r0]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_first:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.u16 q0, [r0, #1]
-; CHECKBE-NEXT:    ldrb r1, [r0]
-; CHECKBE-NEXT:    vmovx.f16 s6, s1
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vins.f16 s6, s2
-; CHECKBE-NEXT:    vmovx.f16 s2, s2
-; CHECKBE-NEXT:    vmov.16 q2[0], r1
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s2, s3
-; CHECKBE-NEXT:    vins.f16 s8, s0
-; CHECKBE-NEXT:    vmov.f32 s9, s4
-; CHECKBE-NEXT:    vmov.f32 s10, s6
-; CHECKBE-NEXT:    vmov.f32 s11, s2
-; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    vldrb.u16 q1, [r0]
+; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -49,32 +25,12 @@ define <8 x i8> @inserti8_first(ptr %p) {
 define <8 x i8> @inserti8_last(ptr %p) {
 ; CHECKLE-LABEL: inserti8_last:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.u16 q1, [r0]
-; CHECKLE-NEXT:    ldrb r1, [r0, #8]
-; CHECKLE-NEXT:    vmovx.f16 s0, s4
-; CHECKLE-NEXT:    vmovx.f16 s1, s5
-; CHECKLE-NEXT:    vmovx.f16 s2, s6
-; CHECKLE-NEXT:    vins.f16 s0, s5
-; CHECKLE-NEXT:    vins.f16 s1, s6
-; CHECKLE-NEXT:    vins.f16 s2, s7
-; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
-; CHECKLE-NEXT:    vmov.16 q0[6], r0
-; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    vldrb.u16 q0, [r0, #1]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_last:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.u16 q0, [r0]
-; CHECKBE-NEXT:    ldrb r1, [r0, #8]
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vmovx.f16 s5, s1
-; CHECKBE-NEXT:    vmovx.f16 s6, s2
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s5, s2
-; CHECKBE-NEXT:    vins.f16 s6, s3
-; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
-; CHECKBE-NEXT:    vmov.16 q1[6], r0
-; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vldrb.u16 q1, [r0, #1]
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 8
@@ -88,37 +44,13 @@ define <8 x i8> @inserti8_last(ptr %p) {
 define <8 x i16> @inserti8_first_sext(ptr %p) {
 ; CHECKLE-LABEL: inserti8_first_sext:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.s16 q1, [r0, #1]
-; CHECKLE-NEXT:    ldrsb.w r1, [r0]
-; CHECKLE-NEXT:    vmovx.f16 s10, s5
-; CHECKLE-NEXT:    vmovx.f16 s8, s4
-; CHECKLE-NEXT:    vins.f16 s10, s6
-; CHECKLE-NEXT:    vmovx.f16 s6, s6
-; CHECKLE-NEXT:    vmov.16 q0[0], r1
-; CHECKLE-NEXT:    vins.f16 s8, s5
-; CHECKLE-NEXT:    vins.f16 s6, s7
-; CHECKLE-NEXT:    vmov.f32 s1, s8
-; CHECKLE-NEXT:    vmov.f32 s2, s10
-; CHECKLE-NEXT:    vins.f16 s0, s4
-; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    vldrb.s16 q0, [r0]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_first_sext:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.s16 q0, [r0, #1]
-; CHECKBE-NEXT:    ldrsb.w r1, [r0]
-; CHECKBE-NEXT:    vmovx.f16 s6, s1
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vins.f16 s6, s2
-; CHECKBE-NEXT:    vmovx.f16 s2, s2
-; CHECKBE-NEXT:    vmov.16 q2[0], r1
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s2, s3
-; CHECKBE-NEXT:    vins.f16 s8, s0
-; CHECKBE-NEXT:    vmov.f32 s9, s4
-; CHECKBE-NEXT:    vmov.f32 s10, s6
-; CHECKBE-NEXT:    vmov.f32 s11, s2
-; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    vldrb.s16 q1, [r0]
+; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -133,32 +65,12 @@ define <8 x i16> @inserti8_first_sext(ptr %p) {
 define <8 x i16> @inserti8_last_sext(ptr %p) {
 ; CHECKLE-LABEL: inserti8_last_sext:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.s16 q1, [r0]
-; CHECKLE-NEXT:    ldrsb.w r1, [r0, #8]
-; CHECKLE-NEXT:    vmovx.f16 s0, s4
-; CHECKLE-NEXT:    vmovx.f16 s1, s5
-; CHECKLE-NEXT:    vmovx.f16 s2, s6
-; CHECKLE-NEXT:    vins.f16 s0, s5
-; CHECKLE-NEXT:    vins.f16 s1, s6
-; CHECKLE-NEXT:    vins.f16 s2, s7
-; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
-; CHECKLE-NEXT:    vmov.16 q0[6], r0
-; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    vldrb.s16 q0, [r0, #1]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_last_sext:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.s16 q0, [r0]
-; CHECKBE-NEXT:    ldrsb.w r1, [r0, #8]
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vmovx.f16 s5, s1
-; CHECKBE-NEXT:    vmovx.f16 s6, s2
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s5, s2
-; CHECKBE-NEXT:    vins.f16 s6, s3
-; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
-; CHECKBE-NEXT:    vmov.16 q1[6], r0
-; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vldrb.s16 q1, [r0, #1]
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 8
@@ -174,37 +86,13 @@ define <8 x i16> @inserti8_last_sext(ptr %p) {
 define <8 x i16> @inserti8_first_zext(ptr %p) {
 ; CHECKLE-LABEL: inserti8_first_zext:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.u16 q1, [r0, #1]
-; CHECKLE-NEXT:    ldrb r1, [r0]
-; CHECKLE-NEXT:    vmovx.f16 s10, s5
-; CHECKLE-NEXT:    vmovx.f16 s8, s4
-; CHECKLE-NEXT:    vins.f16 s10, s6
-; CHECKLE-NEXT:    vmovx.f16 s6, s6
-; CHECKLE-NEXT:    vmov.16 q0[0], r1
-; CHECKLE-NEXT:    vins.f16 s8, s5
-; CHECKLE-NEXT:    vins.f16 s6, s7
-; CHECKLE-NEXT:    vmov.f32 s1, s8
-; CHECKLE-NEXT:    vmov.f32 s2, s10
-; CHECKLE-NEXT:    vins.f16 s0, s4
-; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    vldrb.u16 q0, [r0]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_first_zext:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.u16 q0, [r0, #1]
-; CHECKBE-NEXT:    ldrb r1, [r0]
-; CHECKBE-NEXT:    vmovx.f16 s6, s1
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vins.f16 s6, s2
-; CHECKBE-NEXT:    vmovx.f16 s2, s2
-; CHECKBE-NEXT:    vmov.16 q2[0], r1
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s2, s3
-; CHECKBE-NEXT:    vins.f16 s8, s0
-; CHECKBE-NEXT:    vmov.f32 s9, s4
-; CHECKBE-NEXT:    vmov.f32 s10, s6
-; CHECKBE-NEXT:    vmov.f32 s11, s2
-; CHECKBE-NEXT:    vrev64.16 q0, q2
+; CHECKBE-NEXT:    vldrb.u16 q1, [r0]
+; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 1
   %l1 = load <8 x i8>, ptr %q
@@ -219,32 +107,12 @@ define <8 x i16> @inserti8_first_zext(ptr %p) {
 define <8 x i16> @inserti8_last_zext(ptr %p) {
 ; CHECKLE-LABEL: inserti8_last_zext:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrb.u16 q1, [r0]
-; CHECKLE-NEXT:    ldrb r1, [r0, #8]
-; CHECKLE-NEXT:    vmovx.f16 s0, s4
-; CHECKLE-NEXT:    vmovx.f16 s1, s5
-; CHECKLE-NEXT:    vmovx.f16 s2, s6
-; CHECKLE-NEXT:    vins.f16 s0, s5
-; CHECKLE-NEXT:    vins.f16 s1, s6
-; CHECKLE-NEXT:    vins.f16 s2, s7
-; CHECKLE-NEXT:    vmov.u16 r0, q1[7]
-; CHECKLE-NEXT:    vmov.16 q0[6], r0
-; CHECKLE-NEXT:    vmov.16 q0[7], r1
+; CHECKLE-NEXT:    vldrb.u16 q0, [r0, #1]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti8_last_zext:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrb.u16 q0, [r0]
-; CHECKBE-NEXT:    ldrb r1, [r0, #8]
-; CHECKBE-NEXT:    vmovx.f16 s4, s0
-; CHECKBE-NEXT:    vmovx.f16 s5, s1
-; CHECKBE-NEXT:    vmovx.f16 s6, s2
-; CHECKBE-NEXT:    vins.f16 s4, s1
-; CHECKBE-NEXT:    vins.f16 s5, s2
-; CHECKBE-NEXT:    vins.f16 s6, s3
-; CHECKBE-NEXT:    vmov.u16 r0, q0[7]
-; CHECKBE-NEXT:    vmov.16 q1[6], r0
-; CHECKBE-NEXT:    vmov.16 q1[7], r1
+; CHECKBE-NEXT:    vldrb.u16 q1, [r0, #1]
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 8
@@ -260,14 +128,9 @@ define <8 x i16> @inserti8_last_zext(ptr %p) {
 define <8 x i32> @inserti32_first(ptr %p) {
 ; CHECKLE-LABEL: inserti32_first:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #4]
 ; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #20]
-; CHECKLE-NEXT:    ldr r1, [r0]
-; CHECKLE-NEXT:    vmov.f32 s1, s4
-; CHECKLE-NEXT:    vmov.f32 s2, s5
-; CHECKLE-NEXT:    vmov.f32 s3, s6
-; CHECKLE-NEXT:    vmov.f32 s4, s7
-; CHECKLE-NEXT:    vmov.32 q0[0], r1
+; CHECKLE-NEXT:    vldr s4, [r0, #16]
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKLE-NEXT:    vmov.f32 s5, s8
 ; CHECKLE-NEXT:    vmov.f32 s6, s9
 ; CHECKLE-NEXT:    vmov.f32 s7, s10
@@ -275,19 +138,14 @@ define <8 x i32> @inserti32_first(ptr %p) {
 ;
 ; CHECKBE-LABEL: inserti32_first:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #20]
-; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #4]
-; CHECKBE-NEXT:    ldr r1, [r0]
-; CHECKBE-NEXT:    vmov.f32 s12, s11
-; CHECKBE-NEXT:    vmov.f32 s13, s0
-; CHECKBE-NEXT:    vmov.f32 s14, s1
-; CHECKBE-NEXT:    vmov.f32 s15, s2
-; CHECKBE-NEXT:    vrev64.32 q1, q3
-; CHECKBE-NEXT:    vmov.f32 s13, s8
-; CHECKBE-NEXT:    vmov.f32 s14, s9
-; CHECKBE-NEXT:    vmov.f32 s15, s10
-; CHECKBE-NEXT:    vmov.32 q3[0], r1
-; CHECKBE-NEXT:    vrev64.32 q0, q3
+; CHECKBE-NEXT:    vldrw.u32 q3, [r0, #20]
+; CHECKBE-NEXT:    vldrb.u8 q1, [r0]
+; CHECKBE-NEXT:    vldr s8, [r0, #16]
+; CHECKBE-NEXT:    vmov.f32 s9, s12
+; CHECKBE-NEXT:    vrev64.8 q0, q1
+; CHECKBE-NEXT:    vmov.f32 s10, s13
+; CHECKBE-NEXT:    vmov.f32 s11, s14
+; CHECKBE-NEXT:    vrev64.32 q1, q2
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <8 x i32>, ptr %q
@@ -300,34 +158,24 @@ define <8 x i32> @inserti32_first(ptr %p) {
 define <8 x i32> @inserti32_last(ptr %p) {
 ; CHECKLE-LABEL: inserti32_last:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
-; CHECKLE-NEXT:    ldr r1, [r0, #32]
-; CHECKLE-NEXT:    vmov.f32 s0, s1
-; CHECKLE-NEXT:    vmov.f32 s1, s2
-; CHECKLE-NEXT:    vmov.f32 s2, s3
-; CHECKLE-NEXT:    vmov.f32 s3, s8
-; CHECKLE-NEXT:    vmov.f32 s4, s9
-; CHECKLE-NEXT:    vmov.f32 s5, s10
-; CHECKLE-NEXT:    vmov.f32 s6, s11
-; CHECKLE-NEXT:    vmov.32 q1[3], r1
+; CHECKLE-NEXT:    vldrw.u32 q2, [r0]
+; CHECKLE-NEXT:    vldr s3, [r0, #16]
+; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #20]
+; CHECKLE-NEXT:    vmov.f32 s0, s9
+; CHECKLE-NEXT:    vmov.f32 s1, s10
+; CHECKLE-NEXT:    vmov.f32 s2, s11
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti32_last:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
-; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECKBE-NEXT:    ldr r1, [r0, #32]
-; CHECKBE-NEXT:    vmov.f32 s8, s1
-; CHECKBE-NEXT:    vmov.f32 s9, s2
-; CHECKBE-NEXT:    vmov.f32 s10, s3
-; CHECKBE-NEXT:    vmov.f32 s11, s4
+; CHECKBE-NEXT:    vldrw.u32 q3, [r0]
+; CHECKBE-NEXT:    vldrb.u8 q0, [r0, #20]
+; CHECKBE-NEXT:    vldr s11, [r0, #16]
+; CHECKBE-NEXT:    vmov.f32 s8, s13
+; CHECKBE-NEXT:    vrev64.8 q1, q0
+; CHECKBE-NEXT:    vmov.f32 s9, s14
+; CHECKBE-NEXT:    vmov.f32 s10, s15
 ; CHECKBE-NEXT:    vrev64.32 q0, q2
-; CHECKBE-NEXT:    vmov.f32 s8, s5
-; CHECKBE-NEXT:    vmov.f32 s9, s6
-; CHECKBE-NEXT:    vmov.f32 s10, s7
-; CHECKBE-NEXT:    vmov.32 q2[3], r1
-; CHECKBE-NEXT:    vrev64.32 q1, q2
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
@@ -340,37 +188,29 @@ define <8 x i32> @inserti32_last(ptr %p) {
 define <8 x i32> @inserti32_first_multiuse(ptr %p) {
 ; CHECKLE-LABEL: inserti32_first_multiuse:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #20]
-; CHECKLE-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECKLE-NEXT:    ldr r1, [r0]
-; CHECKLE-NEXT:    vmov.f32 s8, s3
-; CHECKLE-NEXT:    vmov.f32 s9, s4
-; CHECKLE-NEXT:    vmov.f32 s10, s5
-; CHECKLE-NEXT:    vmov.f32 s11, s6
-; CHECKLE-NEXT:    vadd.i32 q1, q1, q2
-; CHECKLE-NEXT:    vmov.f32 s9, s0
-; CHECKLE-NEXT:    vmov.f32 s10, s1
-; CHECKLE-NEXT:    vmov.f32 s11, s2
-; CHECKLE-NEXT:    vmov.32 q2[0], r1
-; CHECKLE-NEXT:    vadd.i32 q0, q0, q2
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0, #20]
+; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #4]
+; CHECKLE-NEXT:    vmov.f32 s4, s11
+; CHECKLE-NEXT:    vmov.f32 s5, s0
+; CHECKLE-NEXT:    vmov.f32 s6, s1
+; CHECKLE-NEXT:    vmov.f32 s7, s2
+; CHECKLE-NEXT:    vadd.i32 q1, q0, q1
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKLE-NEXT:    vadd.i32 q0, q2, q0
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: inserti32_first_multiuse:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #20]
-; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECKBE-NEXT:    ldr r1, [r0]
-; CHECKBE-NEXT:    vmov.f32 s8, s3
-; CHECKBE-NEXT:    vmov.f32 s9, s4
-; CHECKBE-NEXT:    vmov.f32 s10, s5
-; CHECKBE-NEXT:    vmov.f32 s11, s6
-; CHECKBE-NEXT:    vadd.i32 q2, q1, q2
-; CHECKBE-NEXT:    vrev64.32 q1, q2
-; CHECKBE-NEXT:    vmov.f32 s9, s0
-; CHECKBE-NEXT:    vmov.f32 s10, s1
-; CHECKBE-NEXT:    vmov.f32 s11, s2
-; CHECKBE-NEXT:    vmov.32 q2[0], r1
-; CHECKBE-NEXT:    vadd.i32 q2, q0, q2
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #20]
+; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #4]
+; CHECKBE-NEXT:    vmov.f32 s4, s11
+; CHECKBE-NEXT:    vmov.f32 s5, s0
+; CHECKBE-NEXT:    vmov.f32 s6, s1
+; CHECKBE-NEXT:    vmov.f32 s7, s2
+; CHECKBE-NEXT:    vadd.i32 q0, q0, q1
+; CHECKBE-NEXT:    vrev64.32 q1, q0
+; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
+; CHECKBE-NEXT:    vadd.i32 q2, q2, q0
 ; CHECKBE-NEXT:    vrev64.32 q0, q2
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 4
@@ -387,16 +227,12 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 ; CHECKLE:       @ %bb.0:
 ; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECKLE-NEXT:    ldr r1, [r0, #32]
 ; CHECKLE-NEXT:    vmov.f32 s8, s1
 ; CHECKLE-NEXT:    vmov.f32 s9, s2
 ; CHECKLE-NEXT:    vmov.f32 s10, s3
 ; CHECKLE-NEXT:    vmov.f32 s11, s4
 ; CHECKLE-NEXT:    vadd.i32 q0, q0, q2
-; CHECKLE-NEXT:    vmov.f32 s8, s5
-; CHECKLE-NEXT:    vmov.f32 s9, s6
-; CHECKLE-NEXT:    vmov.f32 s10, s7
-; CHECKLE-NEXT:    vmov.32 q2[3], r1
+; CHECKLE-NEXT:    vldrw.u32 q2, [r0, #20]
 ; CHECKLE-NEXT:    vadd.i32 q1, q1, q2
 ; CHECKLE-NEXT:    bx lr
 ;
@@ -404,17 +240,13 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 ; CHECKBE:       @ %bb.0:
 ; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKBE-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECKBE-NEXT:    ldr r1, [r0, #32]
 ; CHECKBE-NEXT:    vmov.f32 s8, s1
 ; CHECKBE-NEXT:    vmov.f32 s9, s2
 ; CHECKBE-NEXT:    vmov.f32 s10, s3
 ; CHECKBE-NEXT:    vmov.f32 s11, s4
 ; CHECKBE-NEXT:    vadd.i32 q2, q0, q2
 ; CHECKBE-NEXT:    vrev64.32 q0, q2
-; CHECKBE-NEXT:    vmov.f32 s8, s5
-; CHECKBE-NEXT:    vmov.f32 s9, s6
-; CHECKBE-NEXT:    vmov.f32 s10, s7
-; CHECKBE-NEXT:    vmov.32 q2[3], r1
+; CHECKBE-NEXT:    vldrw.u32 q2, [r0, #20]
 ; CHECKBE-NEXT:    vadd.i32 q2, q1, q2
 ; CHECKBE-NEXT:    vrev64.32 q1, q2
 ; CHECKBE-NEXT:    bx lr
@@ -430,21 +262,13 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 define <4 x float> @insertf32_first(ptr %p) {
 ; CHECKLE-LABEL: insertf32_first:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q1, [r0, #4]
-; CHECKLE-NEXT:    vldr s0, [r0]
-; CHECKLE-NEXT:    vmov.f32 s1, s4
-; CHECKLE-NEXT:    vmov.f32 s2, s5
-; CHECKLE-NEXT:    vmov.f32 s3, s6
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: insertf32_first:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q0, [r0, #4]
-; CHECKBE-NEXT:    vldr s4, [r0]
-; CHECKBE-NEXT:    vmov.f32 s5, s0
-; CHECKBE-NEXT:    vmov.f32 s6, s1
-; CHECKBE-NEXT:    vmov.f32 s7, s2
-; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    vldrb.u8 q1, [r0]
+; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <4 x float>, ptr %q
@@ -457,21 +281,13 @@ define <4 x float> @insertf32_first(ptr %p) {
 define <4 x float> @insertf32_last(ptr %p) {
 ; CHECKLE-LABEL: insertf32_last:
 ; CHECKLE:       @ %bb.0:
-; CHECKLE-NEXT:    vldrw.u32 q1, [r0]
-; CHECKLE-NEXT:    vldr s3, [r0, #16]
-; CHECKLE-NEXT:    vmov.f32 s0, s5
-; CHECKLE-NEXT:    vmov.f32 s1, s6
-; CHECKLE-NEXT:    vmov.f32 s2, s7
+; CHECKLE-NEXT:    vldrw.u32 q0, [r0, #4]
 ; CHECKLE-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: insertf32_last:
 ; CHECKBE:       @ %bb.0:
-; CHECKBE-NEXT:    vldrw.u32 q0, [r0]
-; CHECKBE-NEXT:    vldr s7, [r0, #16]
-; CHECKBE-NEXT:    vmov.f32 s4, s1
-; CHECKBE-NEXT:    vmov.f32 s5, s2
-; CHECKBE-NEXT:    vmov.f32 s6, s3
-; CHECKBE-NEXT:    vrev64.32 q0, q1
+; CHECKBE-NEXT:    vldrb.u8 q1, [r0, #4]
+; CHECKBE-NEXT:    vrev64.8 q0, q1
 ; CHECKBE-NEXT:    bx lr
   %q = getelementptr inbounds i8, ptr %p, i32 16
   %l1 = load <4 x float>, ptr %p

