[llvm] d36c81e - [AArch64] Fold tree of offset loads combine

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 30 04:25:13 PDT 2023


Author: David Green
Date: 2023-06-30T12:25:07+01:00
New Revision: d36c81e7f6f09a46c802d9b64416c24253140e25

URL: https://github.com/llvm/llvm-project/commit/d36c81e7f6f09a46c802d9b64416c24253140e25
DIFF: https://github.com/llvm/llvm-project/commit/d36c81e7f6f09a46c802d9b64416c24253140e25.diff

LOG: [AArch64] Fold tree of offset loads combine

This attempts to fold trees of add(ext(load p), shl(ext(load p+4))) into a
single load of twice the size, that we extract the bottom part and top part so
that the shl can start to use a shll2 instruction. The two loads in that
example can also be larger trees of instructions, which are identical except
for the leaves which are all loads offset from the LHS, including buildvectors
of multiple loads. For example:
sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))

Whilst it can be common for the larger loads to replace LDP instructions (which
doesn't gain anything on its own), the larger loads in buildvectors can help
create more efficient code, and prevent the need for ld1 lane inserts which can
be more expensive than continuous loads.

This creates a fairly niche, fairly large combine that attempts to be fairly
general where it is beneficial. It helps some SLP vectorized code to avoid the
use of the more expensive ld1 lane inserting loads.

Differential Revision: https://reviews.llvm.org/D153972

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/extbinopload.ll
    llvm/test/CodeGen/AArch64/insert-extend.ll
    llvm/test/CodeGen/AArch64/reduce-shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9b63e8fbc3f56..1afc70f00fd20 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18531,6 +18531,250 @@ static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
                      DAG.getConstant(0, DL, MVT::i64));
 }
 
+static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
+  SDValue BV = peekThroughOneUseBitcasts(B);
+  if (!BV->hasOneUse())
+    return false;
+  if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
+    if (!Ld || !Ld->isSimple())
+      return false;
+    Loads.push_back(Ld);
+    return true;
+  } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
+             BV.getOpcode() == ISD::CONCAT_VECTORS) {
+    for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
+      auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
+      if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
+        return false;
+      Loads.push_back(Ld);
+    }
+    return true;
+  } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    // Try to find a tree of shuffles and concats from how IR shuffles of loads
+    // are lowered. Note that this only comes up because we do not always visit
+    // operands before uses. After that is fixed this can be removed and in the
+    // meantime this is fairly specific to the lowering we expect from IR.
+    // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
+    //   t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
+    //     t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
+    //       t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
+    //       t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
+    //     t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
+    //       t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
+    //   t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
+    //     t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
+    if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
+        B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
+        B.getOperand(1).getNumOperands() != 4)
+      return false;
+    auto SV1 = cast<ShuffleVectorSDNode>(B);
+    auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
+    int NumElts = B.getValueType().getVectorNumElements();
+    int NumSubElts = NumElts / 4;
+    for (int I = 0; I < NumSubElts; I++) {
+      // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
+      if (SV1->getMaskElt(I) != I ||
+          SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
+          SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
+          SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
+        return false;
+      // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
+      if (SV2->getMaskElt(I) != I ||
+          SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
+          SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
+        return false;
+    }
+    auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
+    auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
+    auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
+    auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
+    if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
+        !Ld2->isSimple() || !Ld3->isSimple())
+      return false;
+    Loads.push_back(Ld0);
+    Loads.push_back(Ld1);
+    Loads.push_back(Ld2);
+    Loads.push_back(Ld3);
+    return true;
+  }
+  return false;
+}
+
+static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
+                                            SelectionDAG &DAG,
+                                            unsigned &NumSubLoads) {
+  if (!Op0.hasOneUse() || !Op1.hasOneUse())
+    return false;
+
+  SmallVector<LoadSDNode *> Loads0, Loads1;
+  if (isLoadOrMultipleLoads(Op0, Loads0) &&
+      isLoadOrMultipleLoads(Op1, Loads1)) {
+    if (NumSubLoads && Loads0.size() != NumSubLoads)
+      return false;
+    NumSubLoads = Loads0.size();
+    return Loads0.size() == Loads1.size() &&
+           all_of(zip(Loads0, Loads1), [&DAG](auto L) {
+             unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
+             return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
+                    DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
+                                                       Size / 8, 1);
+           });
+  }
+
+  if (Op0.getOpcode() != Op1.getOpcode())
+    return false;
+
+  switch (Op0.getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
+                                           DAG, NumSubLoads) &&
+           areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
+                                           DAG, NumSubLoads);
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+    EVT XVT = Op0.getOperand(0).getValueType();
+    if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
+        XVT.getScalarSizeInBits() != 32)
+      return false;
+    return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
+                                           DAG, NumSubLoads);
+  }
+  return false;
+}
+
+// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
+// into a single load of twice the size, that we extract the bottom part and top
+// part so that the shl can use a shll2 instruction. The two loads in that
+// example can also be larger trees of instructions, which are identical except
+// for the leaves which are all loads offset from the LHS, including
+// buildvectors of multiple loads. For example the RHS tree could be
+// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
+// Whilst it can be common for the larger loads to replace LDP instructions
+// (which doesn't gain anything on its own), the larger loads can help create
+// more efficient code, and in buildvectors prevent the need for ld1 lane
+// inserts which can be slower than normal loads.
+static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isFixedLengthVector() ||
+      (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
+       VT.getScalarSizeInBits() != 64))
+    return SDValue();
+
+  SDValue Other = N->getOperand(0);
+  SDValue Shift = N->getOperand(1);
+  if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
+    std::swap(Shift, Other);
+  APInt ShiftAmt;
+  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
+      !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
+    return SDValue();
+
+  if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
+      !ISD::isExtOpcode(Other.getOpcode()) ||
+      Shift.getOperand(0).getOperand(0).getValueType() !=
+          Other.getOperand(0).getValueType() ||
+      !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
+    return SDValue();
+
+  SDValue Op0 = Other.getOperand(0);
+  SDValue Op1 = Shift.getOperand(0).getOperand(0);
+
+  unsigned NumSubLoads = 0;
+  if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
+    return SDValue();
+
+  // Attempt to rule out some unprofitable cases using heuristics (some of
+  // which work around suboptimal code generation), notably if the extend would
+  // not be able to use ushll2 instructions as the types are not large enough.
+  // Otherwise zips will need to be created, which can increase the
+  // instruction count.
+  unsigned NumElts = Op0.getValueType().getVectorNumElements();
+  unsigned NumSubElts = NumElts / NumSubLoads;
+  if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
+      (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
+       Op0.getValueType().getSizeInBits() < 128 &&
+       !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
+    return SDValue();
+
+  // Recreate the tree with the new combined loads.
+  std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
+      [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
+        EVT DVT =
+            Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
+
+        SmallVector<LoadSDNode *> Loads0, Loads1;
+        if (isLoadOrMultipleLoads(Op0, Loads0) &&
+            isLoadOrMultipleLoads(Op1, Loads1)) {
+          EVT LoadVT = EVT::getVectorVT(
+              *DAG.getContext(), Op0.getValueType().getScalarType(),
+              Op0.getValueType().getVectorNumElements() / Loads0.size());
+          EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+
+          SmallVector<SDValue> NewLoads;
+          for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
+            SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
+                                       L0->getBasePtr(), L0->getPointerInfo(),
+                                       L0->getOriginalAlign());
+            DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
+            DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
+            NewLoads.push_back(Load);
+          }
+          return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
+        }
+
+        SmallVector<SDValue> Ops;
+        for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
+          Ops.push_back(GenCombinedTree(O0, O1, DAG));
+        return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
+      };
+  SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
+
+  SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
+  int Hi = NumSubElts, Lo = 0;
+  for (unsigned i = 0; i < NumSubLoads; i++) {
+    for (unsigned j = 0; j < NumSubElts; j++) {
+      LowMask[i * NumSubElts + j] = Lo++;
+      HighMask[i * NumSubElts + j] = Hi++;
+    }
+    Lo += NumSubElts;
+    Hi += NumSubElts;
+  }
+  SDLoc DL(N);
+  SDValue Ext0, Ext1;
+  // Extract the top and bottom lanes, then extend the result. Possibly extend
+  // the result then extract the lanes if the two operands match as it produces
+  // slightly smaller code.
+  if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
+    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
+                               NewOp, DAG.getConstant(0, DL, MVT::i64));
+    SDValue SubH =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
+                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
+    SDValue Extr0 =
+        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
+    SDValue Extr1 =
+        DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
+    Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
+    Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
+  } else {
+    EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
+    SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
+    SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
+                               DAG.getConstant(0, DL, MVT::i64));
+    SDValue SubH =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
+                    DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
+    Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
+    Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
+  }
+  SDValue NShift =
+      DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
+  return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
   // Try to change sum of two reductions.
@@ -18553,6 +18797,9 @@ static SDValue performAddSubCombine(SDNode *N,
   if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
     return Val;
 
+  if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
+    return Val;
+
   return performAddSubLongCombine(N, DCI);
 }
 

diff  --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 3465cb998866a..aab41a5445764 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -55,9 +55,9 @@ define <4 x i16> @load_v4i8(ptr %p) {
 define <4 x i32> @load_v4i16_v4i32(ptr %p) {
 ; CHECK-LABEL: load_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d1, d0, [x0]
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    ret
   %l1 = load <4 x i16>, ptr %p
   %q = getelementptr i8, ptr %p, i32 8
@@ -91,11 +91,10 @@ define <4 x i64> @load_v4i32_v4i64(ptr %p) {
 define <4 x i32> @load_v4i8_v4i32(ptr %p) {
 ; CHECK-LABEL: load_v4i8_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s1, s0, [x0]
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    ret
   %l1 = load <4 x i8>, ptr %p
   %q = getelementptr i8, ptr %p, i32 4
@@ -110,30 +109,28 @@ define <4 x i32> @load_v4i8_v4i32(ptr %p) {
 define <4 x i32> @load_v4i12_v4i32(ptr %p) {
 ; CHECK-LABEL: load_v4i12_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #6]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w12, [x0, #10]
-; CHECK-NEXT:    and w10, w8, #0xfff
-; CHECK-NEXT:    ldrh w13, [x0, #4]
-; CHECK-NEXT:    and w11, w9, #0xfff
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    ldr w9, [x0, #8]
+; CHECK-NEXT:    ubfx x10, x8, #48, #12
+; CHECK-NEXT:    lsr x11, x8, #60
+; CHECK-NEXT:    orr w11, w11, w9, lsl #4
+; CHECK-NEXT:    and w12, w8, #0xfff
+; CHECK-NEXT:    and w11, w11, #0xfff
 ; CHECK-NEXT:    fmov s0, w10
 ; CHECK-NEXT:    ubfx w10, w8, #12, #12
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    ubfx w11, w9, #12, #12
-; CHECK-NEXT:    orr x8, x8, x12, lsl #32
-; CHECK-NEXT:    orr x9, x9, x13, lsl #32
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    ubfx x8, x8, #24, #12
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    ubfx x9, x9, #24, #12
-; CHECK-NEXT:    mov v0.s[2], w8
-; CHECK-NEXT:    ubfx w8, w12, #4, #12
-; CHECK-NEXT:    mov v1.s[2], w9
-; CHECK-NEXT:    ubfx w9, w13, #4, #12
-; CHECK-NEXT:    mov v0.s[3], w8
-; CHECK-NEXT:    mov v1.s[3], w9
-; CHECK-NEXT:    shl v0.4s, v0.4s, #3
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fmov s1, w12
+; CHECK-NEXT:    mov v0.h[1], w11
+; CHECK-NEXT:    ubfx w11, w9, #8, #12
+; CHECK-NEXT:    mov v1.h[1], w10
+; CHECK-NEXT:    ubfx x10, x8, #24, #12
+; CHECK-NEXT:    lsr x9, x9, #20
+; CHECK-NEXT:    ubfx x8, x8, #36, #12
+; CHECK-NEXT:    mov v0.h[2], w11
+; CHECK-NEXT:    mov v1.h[2], w10
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    mov v1.h[3], w8
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NEXT:    ret
   %l1 = load <4 x i12>, ptr %p
   %q = getelementptr i8, ptr %p, i32 6
@@ -148,9 +145,9 @@ define <4 x i32> @load_v4i12_v4i32(ptr %p) {
 define <8 x i16> @load_v8i8(ptr %p) {
 ; CHECK-LABEL: load_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d1, d0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #3
-; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #3
+; CHECK-NEXT:    uaddw v0.8h, v1.8h, v0.8b
 ; CHECK-NEXT:    ret
   %l1 = load <8 x i8>, ptr %p
   %q = getelementptr i8, ptr %p, i32 8
@@ -165,11 +162,10 @@ define <8 x i16> @load_v8i8(ptr %p) {
 define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: loadadd_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d0, d1, [x0]
-; CHECK-NEXT:    ldp d3, d2, [x1]
-; CHECK-NEXT:    add v0.8b, v0.8b, v3.8b
-; CHECK-NEXT:    add v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #3
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #3
 ; CHECK-NEXT:    uaddw v0.8h, v1.8h, v0.8b
 ; CHECK-NEXT:    ret
   %l11 = load <8 x i8>, ptr %p1
@@ -190,14 +186,14 @@ define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) {
 define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: loadaddext_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d2, d0, [x0]
-; CHECK-NEXT:    ldp d3, d1, [x1]
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
 ; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #3
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
 ; CHECK-NEXT:    ret
   %l11 = load <8 x i8>, ptr %p1
   %q1 = getelementptr i8, ptr %p1, i32 8
@@ -221,15 +217,10 @@ define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) {
 define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: loadaddext_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    ldp s2, s3, [x1]
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    add v1.4h, v1.4h, v3.4h
-; CHECK-NEXT:    add v0.4h, v0.4h, v2.4h
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    ret
   %l11 = load <4 x i8>, ptr %p1
@@ -321,15 +312,14 @@ define <8 x i16> @load_bv_v4i8(ptr %p, ptr %q) {
 define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) {
 ; CHECK-LABEL: load_bv_v4i8_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #3
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v2.4s, v0.8h
-; CHECK-NEXT:    uaddw v0.4s, v3.4s, v0.4h
+; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
+; CHECK-NEXT:    uaddw v1.4s, v3.4s, v1.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i8>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 4
@@ -349,12 +339,12 @@ define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) {
 define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) {
 ; CHECK-LABEL: load_bv_v4i16_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d0, d1, [x0]
-; CHECK-NEXT:    ldp d3, d2, [x1]
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
-; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT:    uaddw v1.4s, v2.4s, v3.4h
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v2.4s, v0.4h
+; CHECK-NEXT:    uaddw v1.4s, v3.4s, v1.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i16>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 8
@@ -575,30 +565,26 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
 ; CHECK-LABEL: double2_bv_4xv4i8_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s2, s3, [x0]
-; CHECK-NEXT:    ldp s4, s5, [x6]
-; CHECK-NEXT:    ldp s6, s7, [x4]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v4.s }[1], [x7], #4
-; CHECK-NEXT:    ld1 { v6.s }[1], [x5], #4
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x7]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x5]
-; CHECK-NEXT:    usubl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    usubl v4.8h, v0.8b, v4.8b
-; CHECK-NEXT:    usubl v1.8h, v1.8b, v5.8b
-; CHECK-NEXT:    usubl v3.8h, v3.8b, v7.8b
-; CHECK-NEXT:    shll v5.4s, v1.4h, #16
-; CHECK-NEXT:    shll v0.4s, v3.4h, #16
-; CHECK-NEXT:    shll2 v3.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v6.4s, v1.8h, #16
-; CHECK-NEXT:    saddw2 v1.4s, v3.4s, v2.8h
-; CHECK-NEXT:    saddw2 v3.4s, v6.4s, v4.8h
-; CHECK-NEXT:    saddw v0.4s, v0.4s, v2.4h
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v4.4h
+; CHECK-NEXT:    ldr d0, [x4]
+; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d3, [x1]
+; CHECK-NEXT:    ldr d6, [x5]
+; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    ldr d4, [x3]
+; CHECK-NEXT:    ldr d5, [x7]
+; CHECK-NEXT:    ldr d7, [x6]
+; CHECK-NEXT:    usubl v0.8h, v2.8b, v0.8b
+; CHECK-NEXT:    usubl v2.8h, v3.8b, v6.8b
+; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
+; CHECK-NEXT:    usubl v3.8h, v1.8b, v7.8b
+; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT:    saddw v1.4s, v5.4s, v2.4h
+; CHECK-NEXT:    shll2 v2.4s, v3.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v4.8h, #16
+; CHECK-NEXT:    saddw v2.4s, v2.4s, v3.4h
+; CHECK-NEXT:    saddw v3.4s, v5.4s, v4.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i8>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 4
@@ -1270,12 +1256,11 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: commuted_loads:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d0, d1, [x0]
-; CHECK-NEXT:    ldp d3, d2, [x1]
-; CHECK-NEXT:    add v0.8b, v3.8b, v0.8b
-; CHECK-NEXT:    add v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    add v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #3
 ; CHECK-NEXT:    ushll v3.4s, v1.4h, #3
 ; CHECK-NEXT:    uaddw2 v1.4s, v2.4s, v0.8h
@@ -1353,3 +1338,74 @@ define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) {
   %a = sub <8 x i32> %se2, %e1
   ret <8 x i32> %a
 }
+
+define <4 x i32> @bitcast(ptr %p) {
+; CHECK-LABEL: bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT:    ret
+  %l1b = load float, ptr %p
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @atomic(ptr %p) {
+; CHECK-LABEL: atomic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldar w8, [x0]
+; CHECK-NEXT:    ldr s0, [x0, #4]
+; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    zip1 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+  %l1b = load atomic float, ptr %p acquire, align 4
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @volatile(ptr %p) {
+; CHECK-LABEL: volatile:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x0, #4]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
+; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %l1b = load volatile float, ptr %p
+  %l1 = bitcast float %l1b to <4 x i8>
+  %q = getelementptr i8, ptr %p, i32 4
+  %l2b = load float, ptr %q
+  %l2 = bitcast float %l2b to <4 x i8>
+  %e1 = zext <4 x i8> %l1 to <4 x i32>
+  %e2 = zext <4 x i8> %l2 to <4 x i32>
+  %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
+  %a = add <4 x i32> %e1, %e3
+  ret <4 x i32> %a
+}

diff  --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index e331a8dca6976..f2c61f6562bfb 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -48,120 +48,114 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    sxtw x8, w3
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    add x10, x2, x8
-; CHECK-NEXT:    add x11, x0, x9
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    sxtw x9, w3
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    add x12, x10, x8
-; CHECK-NEXT:    add x13, x11, x9
-; CHECK-NEXT:    add x8, x12, x8
-; CHECK-NEXT:    add x9, x13, x9
-; CHECK-NEXT:    ldp s0, s6, [x11]
-; CHECK-NEXT:    ldp s3, s7, [x10]
-; CHECK-NEXT:    ldp s1, s5, [x8]
-; CHECK-NEXT:    ldp s2, s4, [x9]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x12], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x13], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x2], #4
-; CHECK-NEXT:    ld1 { v0.s }[1], [x0], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x12]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x13]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x2]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x0]
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v3.8b
-; CHECK-NEXT:    usubl v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    usubl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    ldr d2, [x10]
+; CHECK-NEXT:    add x10, x11, x9
+; CHECK-NEXT:    ldr d3, [x11]
+; CHECK-NEXT:    ldr d4, [x12]
+; CHECK-NEXT:    ldr d5, [x10]
+; CHECK-NEXT:    ldr d6, [x12, x8]
+; CHECK-NEXT:    ldr d7, [x10, x9]
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
 ; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
-; CHECK-NEXT:    shll v4.4s, v2.4h, #16
-; CHECK-NEXT:    shll v5.4s, v3.4h, #16
-; CHECK-NEXT:    shll2 v3.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v2.4s, v2.8h, #16
-; CHECK-NEXT:    saddw2 v3.4s, v3.4s, v0.8h
-; CHECK-NEXT:    saddw v0.4s, v5.4s, v0.4h
-; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v1.8h
-; CHECK-NEXT:    saddw v1.4s, v4.4s, v1.4h
-; CHECK-NEXT:    rev64 v6.4s, v0.4s
-; CHECK-NEXT:    rev64 v17.4s, v3.4s
+; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
+; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
+; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    rev64 v4.4s, v0.4s
 ; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    addp v7.4s, v1.4s, v2.4s
-; CHECK-NEXT:    rev64 v4.4s, v1.4s
-; CHECK-NEXT:    addp v16.4s, v0.4s, v3.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v17.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    ext v18.16b, v7.16b, v7.16b, #8
-; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    zip1 v5.4s, v0.4s, v3.4s
-; CHECK-NEXT:    uzp2 v19.4s, v7.4s, v16.4s
-; CHECK-NEXT:    uzp1 v7.4s, v7.4s, v16.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    uzp1 v6.4s, v18.4s, v16.4s
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT:    uzp2 v16.4s, v18.4s, v16.4s
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    ext v1.16b, v0.16b, v5.16b, #8
-; CHECK-NEXT:    mov v0.s[3], v3.s[2]
-; CHECK-NEXT:    add v7.4s, v19.4s, v7.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v16.4s
-; CHECK-NEXT:    rev64 v5.4s, v7.4s
-; CHECK-NEXT:    mov v2.d[1], v1.d[1]
-; CHECK-NEXT:    mov v4.d[1], v0.d[1]
+; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
+; CHECK-NEXT:    rev64 v7.4s, v1.4s
+; CHECK-NEXT:    sub v4.4s, v0.4s, v4.4s
 ; CHECK-NEXT:    rev64 v6.4s, v3.4s
-; CHECK-NEXT:    sub v0.4s, v7.4s, v5.4s
-; CHECK-NEXT:    add v5.4s, v4.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v3.4s, v6.4s
-; CHECK-NEXT:    rev64 v4.4s, v5.4s
-; CHECK-NEXT:    addp v6.4s, v7.4s, v5.4s
-; CHECK-NEXT:    rev64 v7.4s, v2.4s
-; CHECK-NEXT:    addp v3.4s, v3.4s, v2.4s
-; CHECK-NEXT:    sub v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    zip1 v16.4s, v6.4s, v6.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v7.4s
-; CHECK-NEXT:    ext v17.16b, v1.16b, v3.16b, #8
-; CHECK-NEXT:    ext v5.16b, v6.16b, v4.16b, #4
-; CHECK-NEXT:    ext v7.16b, v3.16b, v2.16b, #4
-; CHECK-NEXT:    ext v18.16b, v0.16b, v6.16b, #4
-; CHECK-NEXT:    trn2 v0.4s, v16.4s, v0.4s
-; CHECK-NEXT:    ext v16.16b, v17.16b, v1.16b, #4
-; CHECK-NEXT:    zip2 v7.4s, v7.4s, v3.4s
-; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
-; CHECK-NEXT:    ext v18.16b, v18.16b, v18.16b, #4
-; CHECK-NEXT:    mov v1.s[2], v3.s[1]
-; CHECK-NEXT:    uzp2 v16.4s, v17.4s, v16.4s
-; CHECK-NEXT:    ext v7.16b, v2.16b, v7.16b, #12
-; CHECK-NEXT:    ext v5.16b, v4.16b, v5.16b, #12
-; CHECK-NEXT:    mov v2.s[2], v3.s[3]
-; CHECK-NEXT:    mov v4.s[2], v6.s[3]
-; CHECK-NEXT:    sub v17.4s, v0.4s, v18.4s
-; CHECK-NEXT:    mov v18.s[0], v6.s[1]
-; CHECK-NEXT:    sub v19.4s, v1.4s, v16.4s
-; CHECK-NEXT:    sub v20.4s, v2.4s, v7.4s
-; CHECK-NEXT:    sub v21.4s, v4.4s, v5.4s
-; CHECK-NEXT:    mov v1.s[1], v3.s[0]
-; CHECK-NEXT:    mov v2.s[1], v3.s[2]
-; CHECK-NEXT:    mov v4.s[1], v6.s[2]
-; CHECK-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v5.4s
-; CHECK-NEXT:    mov v2.d[1], v20.d[1]
-; CHECK-NEXT:    mov v3.d[1], v21.d[1]
-; CHECK-NEXT:    mov v0.d[1], v17.d[1]
-; CHECK-NEXT:    mov v1.d[1], v19.d[1]
-; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v5.8h, v3.8h, #0
-; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
-; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
+; CHECK-NEXT:    sub v5.4s, v2.4s, v5.4s
+; CHECK-NEXT:    sub v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    zip1 v16.4s, v5.4s, v4.4s
+; CHECK-NEXT:    addp v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    sub v6.4s, v3.4s, v6.4s
+; CHECK-NEXT:    addp v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    zip2 v17.4s, v7.4s, v6.4s
+; CHECK-NEXT:    mov v7.s[1], v6.s[0]
+; CHECK-NEXT:    ext v2.16b, v5.16b, v16.16b, #8
+; CHECK-NEXT:    mov v5.s[3], v4.s[2]
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    uzp2 v4.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v7.d[1], v2.d[1]
+; CHECK-NEXT:    mov v17.d[1], v5.d[1]
+; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v0.4s
+; CHECK-NEXT:    uzp2 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    add v3.4s, v17.4s, v7.4s
+; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    sub v4.4s, v7.4s, v17.4s
+; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    rev64 v2.4s, v3.4s
+; CHECK-NEXT:    rev64 v5.4s, v4.4s
+; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    rev64 v6.4s, v1.4s
+; CHECK-NEXT:    addp v16.4s, v0.4s, v4.4s
+; CHECK-NEXT:    addp v17.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sub v4.4s, v4.4s, v5.4s
+; CHECK-NEXT:    sub v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v6.4s
+; CHECK-NEXT:    ext v3.16b, v16.16b, v4.16b, #4
+; CHECK-NEXT:    ext v5.16b, v0.16b, v16.16b, #8
+; CHECK-NEXT:    ext v6.16b, v17.16b, v2.16b, #4
+; CHECK-NEXT:    zip1 v7.4s, v17.4s, v17.4s
+; CHECK-NEXT:    zip2 v3.4s, v3.4s, v16.4s
+; CHECK-NEXT:    ext v18.16b, v5.16b, v0.16b, #4
+; CHECK-NEXT:    zip2 v6.4s, v6.4s, v17.4s
+; CHECK-NEXT:    trn2 v7.4s, v7.4s, v1.4s
+; CHECK-NEXT:    ext v1.16b, v1.16b, v17.16b, #4
+; CHECK-NEXT:    ext v3.16b, v4.16b, v3.16b, #12
+; CHECK-NEXT:    mov v0.s[2], v16.s[1]
+; CHECK-NEXT:    ext v6.16b, v2.16b, v6.16b, #12
+; CHECK-NEXT:    mov v4.s[2], v16.s[3]
+; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v18.4s
+; CHECK-NEXT:    mov v2.s[2], v17.s[3]
+; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    sub v18.4s, v4.4s, v3.4s
+; CHECK-NEXT:    sub v19.4s, v0.4s, v5.4s
+; CHECK-NEXT:    sub v20.4s, v2.4s, v6.4s
+; CHECK-NEXT:    mov v4.s[1], v16.s[2]
+; CHECK-NEXT:    sub v21.4s, v7.4s, v1.4s
+; CHECK-NEXT:    mov v2.s[1], v17.s[2]
+; CHECK-NEXT:    mov v0.s[1], v16.s[0]
+; CHECK-NEXT:    mov v1.s[0], v17.s[1]
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
 ; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
-; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-NEXT:    mov v3.d[1], v18.d[1]
+; CHECK-NEXT:    mov v2.d[1], v20.d[1]
+; CHECK-NEXT:    mov v1.d[1], v21.d[1]
+; CHECK-NEXT:    mov v0.d[1], v19.d[1]
+; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
+; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
+; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0

diff  --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index b3d1388b55aac..a9365dbb1928c 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -4,123 +4,117 @@
 define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
 ; CHECK-LABEL: v1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x8, w1
-; CHECK-NEXT:    sxtw x10, w3
-; CHECK-NEXT:    add x9, x0, x8
-; CHECK-NEXT:    add x12, x2, x10
-; CHECK-NEXT:    add x11, x9, x8
-; CHECK-NEXT:    add x13, x12, x10
-; CHECK-NEXT:    add x8, x11, x8
-; CHECK-NEXT:    add x10, x13, x10
-; CHECK-NEXT:    ldp s1, s0, [x9]
-; CHECK-NEXT:    ldp s7, s6, [x12]
-; CHECK-NEXT:    ldp s3, s2, [x8]
-; CHECK-NEXT:    ldp s5, s4, [x10]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x13], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x11], #4
-; CHECK-NEXT:    ld1 { v7.s }[1], [x2], #4
-; CHECK-NEXT:    ld1 { v1.s }[1], [x0], #4
-; CHECK-NEXT:    ld1 { v4.s }[1], [x13]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x2]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
-; CHECK-NEXT:    usubl v3.8h, v3.8b, v5.8b
-; CHECK-NEXT:    usubl v2.8h, v2.8b, v4.8b
-; CHECK-NEXT:    usubl v1.8h, v1.8b, v7.8b
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v6.8b
-; CHECK-NEXT:    shll v4.4s, v2.4h, #16
-; CHECK-NEXT:    shll2 v2.4s, v2.8h, #16
-; CHECK-NEXT:    shll v5.4s, v0.4h, #16
-; CHECK-NEXT:    shll2 v0.4s, v0.8h, #16
-; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
-; CHECK-NEXT:    saddw v3.4s, v4.4s, v3.4h
-; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
-; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v2.4s
+; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-NEXT:    sxtw x9, w3
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
+; CHECK-NEXT:    add x12, x10, x8
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    ldr d2, [x10]
+; CHECK-NEXT:    add x10, x11, x9
+; CHECK-NEXT:    ldr d6, [x12, x8]
+; CHECK-NEXT:    ldr d7, [x10, x9]
+; CHECK-NEXT:    ldr d3, [x11]
+; CHECK-NEXT:    ldr d4, [x12]
+; CHECK-NEXT:    ldr d5, [x10]
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
+; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
+; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
+; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
+; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
+; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    zip1 v5.4s, v2.4s, v0.4s
+; CHECK-NEXT:    zip2 v4.4s, v2.4s, v0.4s
+; CHECK-NEXT:    uzp2 v7.4s, v3.4s, v1.4s
+; CHECK-NEXT:    mov v17.16b, v1.16b
+; CHECK-NEXT:    zip2 v18.4s, v3.4s, v1.4s
+; CHECK-NEXT:    ext v19.16b, v2.16b, v5.16b, #8
+; CHECK-NEXT:    uzp2 v7.4s, v7.4s, v3.4s
+; CHECK-NEXT:    mov v2.s[3], v0.s[2]
+; CHECK-NEXT:    zip2 v6.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ext v16.16b, v3.16b, v3.16b, #12
-; CHECK-NEXT:    zip1 v17.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v7.16b, v3.16b
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip2 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT:    zip2 v18.4s, v3.4s, v2.4s
-; CHECK-NEXT:    mov v7.s[0], v2.s[1]
-; CHECK-NEXT:    ext v16.16b, v2.16b, v16.16b, #12
-; CHECK-NEXT:    ext v19.16b, v1.16b, v17.16b, #8
-; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v3.4s
-; CHECK-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-NEXT:    mov v1.s[3], v0.s[2]
-; CHECK-NEXT:    mov v7.d[1], v17.d[1]
-; CHECK-NEXT:    mov v5.d[1], v6.d[1]
-; CHECK-NEXT:    mov v2.d[1], v19.d[1]
-; CHECK-NEXT:    mov v18.d[1], v1.d[1]
-; CHECK-NEXT:    mov v16.d[1], v6.d[1]
-; CHECK-NEXT:    mov v4.d[1], v1.d[1]
-; CHECK-NEXT:    add v0.4s, v7.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v5.4s, v18.4s
+; CHECK-NEXT:    mov v17.s[1], v3.s[0]
+; CHECK-NEXT:    mov v3.s[0], v1.s[1]
+; CHECK-NEXT:    mov v7.d[1], v4.d[1]
+; CHECK-NEXT:    mov v18.d[1], v2.d[1]
+; CHECK-NEXT:    mov v17.d[1], v19.d[1]
+; CHECK-NEXT:    mov v3.d[1], v5.d[1]
+; CHECK-NEXT:    ext v16.16b, v1.16b, v16.16b, #12
+; CHECK-NEXT:    add v1.4s, v7.4s, v18.4s
+; CHECK-NEXT:    mov v6.d[1], v2.d[1]
+; CHECK-NEXT:    add v0.4s, v3.4s, v17.4s
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
+; CHECK-NEXT:    sub v2.4s, v17.4s, v3.4s
+; CHECK-NEXT:    rev64 v3.4s, v1.4s
 ; CHECK-NEXT:    rev64 v5.4s, v0.4s
-; CHECK-NEXT:    sub v3.4s, v4.4s, v16.4s
-; CHECK-NEXT:    rev64 v4.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v7.4s
+; CHECK-NEXT:    sub v4.4s, v6.4s, v16.4s
+; CHECK-NEXT:    mov v3.d[1], v1.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    add v6.4s, v3.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    mov v4.d[1], v1.d[1]
-; CHECK-NEXT:    rev64 v7.4s, v2.4s
+; CHECK-NEXT:    add v6.4s, v4.4s, v2.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    rev64 v4.4s, v2.4s
 ; CHECK-NEXT:    rev64 v3.4s, v6.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v7.4s, v2.4s, v7.4s
+; CHECK-NEXT:    addp v7.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    addp v5.4s, v1.4s, v6.4s
-; CHECK-NEXT:    addp v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
 ; CHECK-NEXT:    sub v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    rev64 v4.4s, v0.4s
-; CHECK-NEXT:    rev64 v6.4s, v1.4s
-; CHECK-NEXT:    zip1 v16.4s, v5.4s, v5.4s
-; CHECK-NEXT:    ext v17.16b, v2.16b, v7.16b, #4
-; CHECK-NEXT:    ext v18.16b, v5.16b, v3.16b, #4
-; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v6.4s
-; CHECK-NEXT:    ext v4.16b, v0.16b, v2.16b, #8
-; CHECK-NEXT:    ext v6.16b, v1.16b, v5.16b, #4
-; CHECK-NEXT:    trn2 v1.4s, v16.4s, v1.4s
-; CHECK-NEXT:    zip2 v16.4s, v17.4s, v2.4s
-; CHECK-NEXT:    zip2 v17.4s, v18.4s, v5.4s
-; CHECK-NEXT:    ext v18.16b, v4.16b, v0.16b, #4
-; CHECK-NEXT:    ext v6.16b, v6.16b, v6.16b, #4
-; CHECK-NEXT:    ext v16.16b, v7.16b, v16.16b, #12
-; CHECK-NEXT:    ext v17.16b, v3.16b, v17.16b, #12
+; CHECK-NEXT:    rev64 v6.4s, v0.4s
+; CHECK-NEXT:    ext v4.16b, v7.16b, v2.16b, #4
+; CHECK-NEXT:    rev64 v16.4s, v1.4s
+; CHECK-NEXT:    ext v17.16b, v5.16b, v3.16b, #4
+; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
+; CHECK-NEXT:    zip2 v4.4s, v4.4s, v7.4s
+; CHECK-NEXT:    ext v6.16b, v0.16b, v7.16b, #8
+; CHECK-NEXT:    sub v1.4s, v1.4s, v16.4s
+; CHECK-NEXT:    zip2 v16.4s, v17.4s, v5.4s
+; CHECK-NEXT:    zip1 v18.4s, v5.4s, v5.4s
+; CHECK-NEXT:    ext v19.16b, v1.16b, v5.16b, #4
+; CHECK-NEXT:    ext v4.16b, v2.16b, v4.16b, #12
+; CHECK-NEXT:    mov v2.s[2], v7.s[3]
+; CHECK-NEXT:    ext v17.16b, v6.16b, v0.16b, #4
+; CHECK-NEXT:    ext v16.16b, v3.16b, v16.16b, #12
 ; CHECK-NEXT:    mov v3.s[2], v5.s[3]
-; CHECK-NEXT:    mov v7.s[2], v2.s[3]
-; CHECK-NEXT:    mov v0.s[2], v2.s[1]
-; CHECK-NEXT:    uzp2 v4.4s, v4.4s, v18.4s
-; CHECK-NEXT:    sub v20.4s, v3.4s, v17.4s
-; CHECK-NEXT:    sub v21.4s, v7.4s, v16.4s
+; CHECK-NEXT:    trn2 v1.4s, v18.4s, v1.4s
+; CHECK-NEXT:    ext v18.16b, v19.16b, v19.16b, #4
+; CHECK-NEXT:    mov v0.s[2], v7.s[1]
+; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v17.4s
+; CHECK-NEXT:    sub v17.4s, v2.4s, v4.4s
+; CHECK-NEXT:    sub v21.4s, v3.4s, v16.4s
 ; CHECK-NEXT:    mov v3.s[1], v5.s[2]
-; CHECK-NEXT:    mov v7.s[1], v2.s[2]
-; CHECK-NEXT:    sub v18.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mov v6.s[0], v5.s[1]
-; CHECK-NEXT:    sub v19.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-NEXT:    add v2.4s, v3.4s, v17.4s
-; CHECK-NEXT:    add v3.4s, v7.4s, v16.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v6.4s
+; CHECK-NEXT:    mov v2.s[1], v7.s[2]
+; CHECK-NEXT:    sub v19.4s, v1.4s, v18.4s
+; CHECK-NEXT:    mov v18.s[0], v5.s[1]
+; CHECK-NEXT:    sub v20.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mov v0.s[1], v7.s[0]
+; CHECK-NEXT:    add v3.4s, v3.4s, v16.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v18.4s
+; CHECK-NEXT:    mov v2.d[1], v17.d[1]
 ; CHECK-NEXT:    mov v3.d[1], v21.d[1]
-; CHECK-NEXT:    mov v2.d[1], v20.d[1]
-; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mov v1.d[1], v18.d[1]
-; CHECK-NEXT:    mov v0.d[1], v19.d[1]
-; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v2.8h, #0
+; CHECK-NEXT:    add v0.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mov v1.d[1], v19.d[1]
+; CHECK-NEXT:    mov v0.d[1], v20.d[1]
+; CHECK-NEXT:    cmlt v6.8h, v2.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v3.8h, #0
 ; CHECK-NEXT:    cmlt v4.8h, v1.8h, #0
-; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    add v3.4s, v7.4s, v3.4s
 ; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
 ; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    eor v1.16b, v1.16b, v4.16b
 ; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
@@ -226,121 +220,115 @@ entry:
 define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
 ; CHECK-LABEL: v2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x8, w1
-; CHECK-NEXT:    sxtw x10, w3
-; CHECK-NEXT:    add x9, x0, x8
-; CHECK-NEXT:    add x12, x2, x10
-; CHECK-NEXT:    add x11, x9, x8
-; CHECK-NEXT:    add x13, x12, x10
-; CHECK-NEXT:    add x8, x11, x8
-; CHECK-NEXT:    add x10, x13, x10
-; CHECK-NEXT:    ldp s1, s0, [x9]
-; CHECK-NEXT:    ldp s7, s6, [x12]
-; CHECK-NEXT:    ldp s3, s2, [x8]
-; CHECK-NEXT:    ldp s5, s4, [x10]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x13], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x11], #4
-; CHECK-NEXT:    ld1 { v7.s }[1], [x2], #4
-; CHECK-NEXT:    ld1 { v1.s }[1], [x0], #4
-; CHECK-NEXT:    ld1 { v4.s }[1], [x13]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x2]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
-; CHECK-NEXT:    usubl v3.8h, v3.8b, v5.8b
-; CHECK-NEXT:    usubl v2.8h, v2.8b, v4.8b
-; CHECK-NEXT:    usubl v1.8h, v1.8b, v7.8b
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v6.8b
-; CHECK-NEXT:    shll v4.4s, v2.4h, #16
-; CHECK-NEXT:    shll2 v2.4s, v2.8h, #16
-; CHECK-NEXT:    shll v5.4s, v0.4h, #16
-; CHECK-NEXT:    shll2 v0.4s, v0.8h, #16
-; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
-; CHECK-NEXT:    saddw v3.4s, v4.4s, v3.4h
-; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
-; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v2.4s
+; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-NEXT:    sxtw x9, w3
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
+; CHECK-NEXT:    add x12, x10, x8
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    ldr d2, [x10]
+; CHECK-NEXT:    add x10, x11, x9
+; CHECK-NEXT:    ldr d6, [x12, x8]
+; CHECK-NEXT:    ldr d7, [x10, x9]
+; CHECK-NEXT:    ldr d3, [x11]
+; CHECK-NEXT:    ldr d4, [x12]
+; CHECK-NEXT:    ldr d5, [x10]
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
+; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
+; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
+; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
+; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
+; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    zip1 v5.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #12
-; CHECK-NEXT:    zip1 v7.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp2 v7.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    mov v16.16b, v3.16b
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip2 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT:    zip2 v18.4s, v3.4s, v2.4s
-; CHECK-NEXT:    mov v16.s[0], v2.s[1]
-; CHECK-NEXT:    ext v19.16b, v1.16b, v7.16b, #8
-; CHECK-NEXT:    ext v17.16b, v2.16b, v17.16b, #12
-; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v3.4s
-; CHECK-NEXT:    mov v1.s[3], v0.s[2]
-; CHECK-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-NEXT:    mov v16.d[1], v7.d[1]
-; CHECK-NEXT:    mov v5.d[1], v6.d[1]
-; CHECK-NEXT:    mov v18.d[1], v1.d[1]
-; CHECK-NEXT:    mov v2.d[1], v19.d[1]
-; CHECK-NEXT:    mov v4.d[1], v1.d[1]
-; CHECK-NEXT:    mov v17.d[1], v6.d[1]
-; CHECK-NEXT:    add v0.4s, v5.4s, v18.4s
-; CHECK-NEXT:    add v1.4s, v16.4s, v2.4s
+; CHECK-NEXT:    zip2 v4.4s, v2.4s, v0.4s
+; CHECK-NEXT:    zip2 v6.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v18.4s, v3.4s, v1.4s
+; CHECK-NEXT:    ext v19.16b, v2.16b, v5.16b, #8
+; CHECK-NEXT:    mov v16.s[0], v1.s[1]
+; CHECK-NEXT:    ext v17.16b, v1.16b, v17.16b, #12
+; CHECK-NEXT:    uzp2 v7.4s, v7.4s, v3.4s
+; CHECK-NEXT:    mov v2.s[3], v0.s[2]
+; CHECK-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-NEXT:    mov v16.d[1], v5.d[1]
+; CHECK-NEXT:    mov v7.d[1], v4.d[1]
+; CHECK-NEXT:    mov v18.d[1], v2.d[1]
+; CHECK-NEXT:    mov v1.d[1], v19.d[1]
+; CHECK-NEXT:    mov v6.d[1], v2.d[1]
+; CHECK-NEXT:    mov v17.d[1], v4.d[1]
+; CHECK-NEXT:    add v0.4s, v7.4s, v18.4s
+; CHECK-NEXT:    add v2.4s, v16.4s, v1.4s
 ; CHECK-NEXT:    rev64 v3.4s, v0.4s
-; CHECK-NEXT:    rev64 v5.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v16.4s
-; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
+; CHECK-NEXT:    rev64 v4.4s, v2.4s
+; CHECK-NEXT:    sub v5.4s, v6.4s, v17.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v16.4s
 ; CHECK-NEXT:    mov v3.d[1], v0.d[1]
-; CHECK-NEXT:    mov v5.d[1], v1.d[1]
-; CHECK-NEXT:    add v6.4s, v4.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    zip1 v3.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip1 v4.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mov v4.d[1], v2.d[1]
+; CHECK-NEXT:    add v6.4s, v5.4s, v1.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    zip1 v3.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mov v17.16b, v1.16b
-; CHECK-NEXT:    zip2 v7.4s, v0.4s, v6.4s
-; CHECK-NEXT:    ext v16.16b, v1.16b, v3.16b, #8
-; CHECK-NEXT:    trn2 v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    zip2 v4.4s, v2.4s, v1.4s
+; CHECK-NEXT:    zip1 v7.4s, v0.4s, v6.4s
+; CHECK-NEXT:    ext v16.16b, v2.16b, v3.16b, #8
+; CHECK-NEXT:    zip2 v17.4s, v0.4s, v6.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v0.4s
-; CHECK-NEXT:    zip2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mov v17.s[3], v2.s[2]
-; CHECK-NEXT:    mov v0.s[1], v6.s[1]
-; CHECK-NEXT:    mov v4.d[1], v16.d[1]
-; CHECK-NEXT:    mov v5.d[1], v1.d[1]
-; CHECK-NEXT:    mov v7.d[1], v17.d[1]
-; CHECK-NEXT:    mov v0.d[1], v3.d[1]
-; CHECK-NEXT:    add v1.4s, v7.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v0.4s, v4.4s, v0.4s
-; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #4
-; CHECK-NEXT:    ext v16.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
-; CHECK-NEXT:    zip2 v5.4s, v0.4s, v2.4s
-; CHECK-NEXT:    zip1 v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v7.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    zip1 v17.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip2 v2.4s, v2.4s, v0.4s
-; CHECK-NEXT:    ext v0.16b, v4.16b, v0.16b, #8
-; CHECK-NEXT:    ext v3.16b, v16.16b, v3.16b, #8
-; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
-; CHECK-NEXT:    ext v0.16b, v0.16b, v4.16b, #4
-; CHECK-NEXT:    ext v3.16b, v3.16b, v16.16b, #4
-; CHECK-NEXT:    sub v5.4s, v6.4s, v17.4s
-; CHECK-NEXT:    cmlt v7.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v17.8h, v1.8h, #0
-; CHECK-NEXT:    cmlt v6.8h, v5.8h, #0
-; CHECK-NEXT:    add v1.4s, v17.4s, v1.4s
-; CHECK-NEXT:    add v2.4s, v7.4s, v2.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    add v4.4s, v6.4s, v5.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v17.16b
-; CHECK-NEXT:    cmlt v3.8h, v0.8h, #0
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    eor v2.16b, v4.16b, v6.16b
+; CHECK-NEXT:    mov v2.s[3], v1.s[2]
+; CHECK-NEXT:    mov v18.16b, v0.16b
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    mov v18.s[1], v6.s[1]
+; CHECK-NEXT:    mov v5.d[1], v4.d[1]
+; CHECK-NEXT:    mov v17.d[1], v2.d[1]
+; CHECK-NEXT:    mov v0.d[1], v16.d[1]
+; CHECK-NEXT:    mov v18.d[1], v3.d[1]
+; CHECK-NEXT:    add v1.4s, v17.4s, v5.4s
+; CHECK-NEXT:    sub v2.4s, v5.4s, v17.4s
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    add v3.4s, v18.4s, v0.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v18.4s
+; CHECK-NEXT:    ext v5.16b, v3.16b, v3.16b, #4
+; CHECK-NEXT:    ext v16.16b, v4.16b, v2.16b, #8
+; CHECK-NEXT:    zip1 v6.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip2 v7.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v17.16b, v5.16b, v0.16b, #8
+; CHECK-NEXT:    zip2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    zip2 v2.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v4.16b, v16.16b, v4.16b, #4
+; CHECK-NEXT:    zip1 v16.4s, v3.4s, v0.4s
+; CHECK-NEXT:    zip2 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    ext v5.16b, v17.16b, v5.16b, #4
 ; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    eor v0.16b, v0.16b, v3.16b
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    sub v3.4s, v6.4s, v16.4s
+; CHECK-NEXT:    sub v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    cmlt v6.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
+; CHECK-NEXT:    add v2.4s, v5.4s, v4.4s
+; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
+; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    eor v1.16b, v3.16b, v4.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    lsr w9, w8, #16
@@ -446,117 +434,112 @@ entry:
 define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
 ; CHECK-LABEL: v3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    sxtw x8, w3
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    add x10, x2, x8
-; CHECK-NEXT:    add x11, x0, x9
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-NEXT:    sxtw x9, w3
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    add x12, x10, x8
-; CHECK-NEXT:    add x13, x11, x9
-; CHECK-NEXT:    add x8, x12, x8
-; CHECK-NEXT:    add x9, x13, x9
-; CHECK-NEXT:    ldp s0, s6, [x11]
-; CHECK-NEXT:    ldp s3, s7, [x10]
-; CHECK-NEXT:    ldp s1, s5, [x8]
-; CHECK-NEXT:    ldp s2, s4, [x9]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x12], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x13], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x2], #4
-; CHECK-NEXT:    ld1 { v0.s }[1], [x0], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x12]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x13]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x2]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x0]
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v3.8b
-; CHECK-NEXT:    usubl v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    usubl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    ldr d2, [x10]
+; CHECK-NEXT:    add x10, x11, x9
+; CHECK-NEXT:    ldr d4, [x12, x8]
+; CHECK-NEXT:    ldr d5, [x10, x9]
+; CHECK-NEXT:    ldr d3, [x11]
+; CHECK-NEXT:    ldr d6, [x12]
+; CHECK-NEXT:    ldr d7, [x10]
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
 ; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
-; CHECK-NEXT:    shll v4.4s, v2.4h, #16
-; CHECK-NEXT:    shll v5.4s, v3.4h, #16
-; CHECK-NEXT:    shll2 v3.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v2.4s, v2.8h, #16
-; CHECK-NEXT:    saddw2 v3.4s, v3.4s, v0.8h
-; CHECK-NEXT:    saddw v0.4s, v5.4s, v0.4h
-; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v1.8h
-; CHECK-NEXT:    rev64 v17.4s, v3.4s
-; CHECK-NEXT:    rev64 v6.4s, v0.4s
-; CHECK-NEXT:    saddw v1.4s, v4.4s, v1.4h
+; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
+; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
+; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
+; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
+; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    rev64 v4.4s, v0.4s
 ; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    addp v16.4s, v0.4s, v3.4s
-; CHECK-NEXT:    rev64 v4.4s, v1.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v17.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    addp v7.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ext v17.16b, v0.16b, v3.16b, #4
+; CHECK-NEXT:    rev64 v7.4s, v1.4s
+; CHECK-NEXT:    rev64 v16.4s, v3.4s
+; CHECK-NEXT:    addp v6.4s, v2.4s, v0.4s
+; CHECK-NEXT:    addp v17.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v16.4s
-; CHECK-NEXT:    ext v4.16b, v16.16b, v16.16b, #8
-; CHECK-NEXT:    uzp1 v16.4s, v7.4s, v16.4s
-; CHECK-NEXT:    zip2 v6.4s, v1.4s, v2.4s
-; CHECK-NEXT:    mov v3.s[3], v0.s[2]
-; CHECK-NEXT:    zip1 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ext v0.16b, v17.16b, v0.16b, #4
-; CHECK-NEXT:    rev64 v2.4s, v5.4s
-; CHECK-NEXT:    uzp1 v5.4s, v7.4s, v4.4s
-; CHECK-NEXT:    rev64 v16.4s, v16.4s
-; CHECK-NEXT:    uzp2 v4.4s, v7.4s, v4.4s
-; CHECK-NEXT:    mov v6.d[1], v3.d[1]
+; CHECK-NEXT:    sub v3.4s, v3.4s, v16.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v7.4s
+; CHECK-NEXT:    ext v4.16b, v2.16b, v0.16b, #4
+; CHECK-NEXT:    zip2 v5.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mov v0.s[3], v2.s[2]
+; CHECK-NEXT:    uzp2 v7.4s, v17.4s, v6.4s
+; CHECK-NEXT:    zip1 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    ext v3.16b, v6.16b, v6.16b, #8
+; CHECK-NEXT:    mov v5.d[1], v0.d[1]
+; CHECK-NEXT:    ext v0.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT:    uzp1 v2.4s, v17.4s, v6.4s
+; CHECK-NEXT:    rev64 v4.4s, v7.4s
 ; CHECK-NEXT:    mov v1.d[1], v0.d[1]
-; CHECK-NEXT:    add v0.4s, v2.4s, v16.4s
-; CHECK-NEXT:    sub v2.4s, v5.4s, v4.4s
-; CHECK-NEXT:    sub v3.4s, v1.4s, v6.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    zip1 v4.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip1 v5.4s, v0.4s, v1.4s
-; CHECK-NEXT:    uzp2 v6.4s, v0.4s, v1.4s
-; CHECK-NEXT:    zip2 v7.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip2 v16.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v17.16b, v2.16b, v4.16b, #8
-; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v0.4s
-; CHECK-NEXT:    mov v2.s[3], v3.s[2]
-; CHECK-NEXT:    trn2 v3.4s, v0.4s, v5.4s
-; CHECK-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-NEXT:    mov v6.d[1], v7.d[1]
-; CHECK-NEXT:    mov v16.d[1], v2.d[1]
-; CHECK-NEXT:    mov v3.d[1], v17.d[1]
-; CHECK-NEXT:    mov v0.d[1], v4.d[1]
-; CHECK-NEXT:    add v1.4s, v6.4s, v16.4s
-; CHECK-NEXT:    sub v2.4s, v16.4s, v6.4s
-; CHECK-NEXT:    add v7.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    sub v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    ext v3.16b, v7.16b, v7.16b, #4
-; CHECK-NEXT:    zip1 v4.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip2 v5.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip2 v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip2 v16.4s, v0.4s, v7.4s
-; CHECK-NEXT:    zip1 v17.4s, v7.4s, v0.4s
-; CHECK-NEXT:    zip2 v7.4s, v7.4s, v0.4s
-; CHECK-NEXT:    ext v2.16b, v6.16b, v2.16b, #8
-; CHECK-NEXT:    ext v0.16b, v3.16b, v0.16b, #8
-; CHECK-NEXT:    add v1.4s, v16.4s, v1.4s
-; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
-; CHECK-NEXT:    ext v2.16b, v2.16b, v6.16b, #4
-; CHECK-NEXT:    ext v0.16b, v0.16b, v3.16b, #4
-; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
-; CHECK-NEXT:    cmlt v5.8h, v4.8h, #0
-; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmlt v2.8h, v1.8h, #0
-; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
-; CHECK-NEXT:    add v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    eor v2.16b, v4.16b, v5.16b
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
+; CHECK-NEXT:    rev64 v0.4s, v2.4s
+; CHECK-NEXT:    uzp1 v2.4s, v17.4s, v3.4s
+; CHECK-NEXT:    uzp2 v3.4s, v17.4s, v3.4s
+; CHECK-NEXT:    add v6.4s, v5.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    zip1 v3.4s, v0.4s, v6.4s
+; CHECK-NEXT:    zip1 v4.4s, v2.4s, v1.4s
+; CHECK-NEXT:    mov v7.16b, v0.16b
+; CHECK-NEXT:    uzp2 v5.4s, v0.4s, v6.4s
+; CHECK-NEXT:    trn2 v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v16.16b, v2.16b, v4.16b, #8
+; CHECK-NEXT:    mov v7.s[1], v6.s[1]
+; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v0.4s
+; CHECK-NEXT:    zip2 v0.4s, v0.4s, v6.4s
+; CHECK-NEXT:    mov v3.d[1], v16.d[1]
+; CHECK-NEXT:    zip2 v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    mov v7.d[1], v4.d[1]
+; CHECK-NEXT:    mov v2.s[3], v1.s[2]
+; CHECK-NEXT:    mov v5.d[1], v6.d[1]
+; CHECK-NEXT:    add v1.4s, v3.4s, v7.4s
+; CHECK-NEXT:    mov v0.d[1], v2.d[1]
+; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    sub v3.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v4.4s, v5.4s, v0.4s
+; CHECK-NEXT:    ext v6.16b, v2.16b, v3.16b, #8
+; CHECK-NEXT:    ext v7.16b, v4.16b, v4.16b, #4
+; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    zip2 v5.4s, v3.4s, v1.4s
+; CHECK-NEXT:    ext v2.16b, v6.16b, v2.16b, #4
+; CHECK-NEXT:    ext v6.16b, v7.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v16.4s, v4.4s, v0.4s
+; CHECK-NEXT:    zip2 v17.4s, v4.4s, v0.4s
+; CHECK-NEXT:    zip2 v0.4s, v0.4s, v4.4s
+; CHECK-NEXT:    ext v4.16b, v6.16b, v7.16b, #4
+; CHECK-NEXT:    zip1 v6.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    sub v3.4s, v16.4s, v6.4s
+; CHECK-NEXT:    sub v1.4s, v17.4s, v1.4s
+; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
+; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
+; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
+; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    eor v1.16b, v3.16b, v4.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    lsr w9, w8, #16


        


More information about the llvm-commits mailing list