[llvm] [WIP][RFC] Implementation for SVE2 long operations (PR #89310)

Usman Nadeem via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 18 14:00:43 PDT 2024


https://github.com/UsmanNadeem created https://github.com/llvm/llvm-project/pull/89310

I have written this patch to show the kind of optimized codegen we should expect and to get feedback on the codegen approach.

Also note that the loop vectorizer currently does not generate wide scalable vector IR (probably because of the cost model), so the attached test case was converted by hand from fixed-vector IR to the scalable-vector form.
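
As a rough illustration (the function names here are made up, mirroring the simplest case in the attached test), the hand conversion just swaps the fixed vector types for their scalable equivalents:

```
; Fixed-width form that the loop vectorizer produces today.
define <16 x i16> @uaddl_fixed(<16 x i8> %a, <16 x i8> %b) {
  %ea = zext <16 x i8> %a to <16 x i16>
  %eb = zext <16 x i8> %b to <16 x i16>
  %add = add nuw nsw <16 x i16> %eb, %ea
  ret <16 x i16> %add
}

; Hand-converted scalable form used in the attached test.
define <vscale x 16 x i16> @uaddl_scalable(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  %ea = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
  %eb = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
  %add = add nuw nsw <vscale x 16 x i16> %eb, %ea
  ret <vscale x 16 x i16> %add
}
```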

There are a few issues that make the SVE2 widening-op implementation not so straightforward.
- SVE2 widening operations differ from NEON in that they operate on the even/odd lanes of the vector rather than on the hi/lo halves. This makes things tricky: to get the "natural" form of the vector, i.e. hi/lo halves, we need to interleave the results of the bottom/top instructions. So there is no 1-1 mapping due to the extra shuffles needed, and we have to bundle the bot/top pair together with the interleave.
- It is difficult to write patterns on legal types because legalization produces unpacks, which are in hi/lo form. The even/odd form of the operations needs to reason about lanes on both sides of the middle boundary, so we need access to the wide operations on illegal types in order to lower them to SVE2 long operations.
- Since the order of lanes in the input vectors does not matter for basic arithmetic operations (as long as the LHS and RHS lanes correspond to each other), some of the intermediate shuffles can be optimized away when one widening instruction feeds another. Right now I have added this logic to the same function and handle it during lowering, but I feel it could be a combine on the `vector_interleave` node (I am not fully sure; I need to check the DAGs for various types).

Example:

```
// Legalized operations are difficult to pattern match for SVE2
8xi16 V = 7 6 5 4 3 2 1 0
v_unpklo = 3 2 1 0
v_unpkhi = 7 6 5 4
add(v_unpklo, v_unpklo) = 6 4 2 0
add(v_unpkhi, v_unpkhi) = 14 12 10 8

// We cannot look at either of the above two legal adds in isolation.

// SVE2 addlb(V, V) works on lanes 6 4 2 0 and produces 12 8 4 0.
// SVE2 addlt(V, V) works on lanes 7 5 3 1 and produces 14 10 6 2.
// We need shuffles to get [14 12 10 8] and [6 4 2 0].
// There is no root node through which we can access both adds, so
// we need to generate both bot/top + shuffles at the same time.
```
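
If it helps to see the target shape in IR terms, here is a minimal sketch of what the simplest (nxv16i8 -> nxv16i16) case is equivalent to, written with the existing SVE2 intrinsics and `llvm.vector.interleave2`. This is only an illustration of the bundling; the patch builds the corresponding ISD nodes during lowering rather than emitting this IR.

```
; Bottom/top instructions operate on even/odd lanes; the interleave restores
; the natural (hi/lo) lane order. Equivalent to zext + zext + add on nxv16i8.
declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddlb.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.aarch64.sve.uaddlt.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)

define <vscale x 16 x i16> @uaddl_lowering_sketch(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  %bot = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddlb.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %top = call <vscale x 8 x i16> @llvm.aarch64.sve.uaddlt.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  ; After legalization the interleave becomes the zip1/zip2 pair seen in the
  ; SVE2 CHECK lines of the attached tests.
  %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %bot, <vscale x 8 x i16> %top)
  ret <vscale x 16 x i16> %res
}
```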


>From ebe53884f226de8003a8d6e098bf4e7335924f49 Mon Sep 17 00:00:00 2001
From: "Nadeem, Usman" <mnadeem at quicinc.com>
Date: Thu, 18 Apr 2024 12:39:01 -0700
Subject: [PATCH] [WIP][RFC] Implementation for SVE2 long operations

Change-Id: I6e8f70342ff25f6ab21cd5666c9085be0fa2e206
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 292 ++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   3 +
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  20 +-
 llvm/test/CodeGen/AArch64/sve-doublereduct.ll |  16 +-
 llvm/test/CodeGen/AArch64/sve2-uaddl.ll       | 636 ++++++++++++++++++
 5 files changed, 956 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-uaddl.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd0..fa8ec9b7a55f21 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -49,6 +49,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetCallingConv.h"
@@ -104,6 +105,7 @@
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
+namespace sd = llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "aarch64-lower"
 
@@ -1416,6 +1418,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
         setOperationAction(ISD::OR, VT, Custom);
     }
 
+    // Illegal wide integer scalable vector types.
+    if (Subtarget->hasSVE2orSME()) {
+      for (auto VT : {MVT::nxv16i16, MVT::nxv16i32, MVT::nxv16i64})
+        setOperationAction(ISD::ADD, VT, Custom);
+      for (auto VT : {MVT::nxv8i32, MVT::nxv8i64})
+        setOperationAction(ISD::ADD, VT, Custom);
+      setOperationAction(ISD::ADD, MVT::nxv4i64, Custom);
+    }
+
     // Illegal unpacked integer vector types.
     for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -2725,6 +2736,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CTTZ_ELTS)
     MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
     MAKE_CASE(AArch64ISD::URSHR_I_PRED)
+    MAKE_CASE(AArch64ISD::UADDLB)
+    MAKE_CASE(AArch64ISD::UADDLT)
   }
 #undef MAKE_CASE
   return nullptr;
@@ -25081,6 +25094,282 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
 }
 
+static bool matchUADDLOps(SDNode *N, SelectionDAG &DAG, SDValue &A, SDValue &B,
+                          unsigned &BotOpc, unsigned &TopOpc) {
+  BotOpc = AArch64ISD::UADDLB;
+  TopOpc = AArch64ISD::UADDLT;
+  if (sd_match(N, sd::m_Add(sd::m_OneUse(sd::m_ZExt(sd::m_Value(A))),
+                            sd::m_OneUse(sd::m_ZExt(sd::m_Value(B))))))
+
+    return true;
+
+#if 0
+  // Extended loads.
+  if (sd_match(N, sd::m_Add(sd::m_OneUse(sd::m_ZExt(sd::m_Value(A))),
+                            sd::m_OneUse(sd::m_Value(B))))) {
+    auto *LDB = dyn_cast<LoadSDNode>(B);
+    if (LDB && LDB->getExtensionType() == ISD::ZEXTLOAD) {
+      B = DAG.getLoad(LDB->getMemoryVT(), SDLoc(LDB), LDB->getChain(),
+                      LDB->getBasePtr(), LDB->getMemOperand());
+      return true;
+    }
+  } else if (sd_match(N, sd::m_Add(sd::m_OneUse(sd::m_Value(A)),
+                                   sd::m_OneUse(sd::m_Value(B)))) &&
+             isa<LoadSDNode>(A) && isa<LoadSDNode>(B)) {
+    auto *LDA = cast<LoadSDNode>(A);
+    auto *LDB = cast<LoadSDNode>(B);
+    if (LDA->getExtensionType() == ISD::ZEXTLOAD &&
+        LDB->getExtensionType() == ISD::ZEXTLOAD) {
+      A = DAG.getLoad(LDA->getMemoryVT(), SDLoc(LDA), LDA->getChain(),
+                      LDA->getBasePtr(), LDA->getMemOperand());
+      B = DAG.getLoad(LDB->getMemoryVT(), SDLoc(LDB), LDB->getChain(),
+                      LDB->getBasePtr(), LDB->getMemOperand());
+      return true;
+    }
+  }
+#endif
+  return false;
+}
+static bool replaceIntOpWithSVE2LongOp(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG,
+                                       const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasSVE2orSME())
+    return false;
+
+  EVT VT = N->getValueType(0);
+  LLVMContext &Ctx = *DAG.getContext();
+  SDLoc DL(N);
+  SDValue LHS, RHS;
+  unsigned BotOpc, TopOpc;
+
+  auto CreateLongOpPair = [&](SDValue LHS,
+                              SDValue RHS) -> std::pair<SDValue, SDValue> {
+    EVT WideResVT = LHS.getValueType()
+                        .widenIntegerVectorElementType(Ctx)
+                        .getHalfNumVectorElementsVT(Ctx);
+    SDValue Even = DAG.getNode(BotOpc, DL, WideResVT, LHS, RHS);
+    SDValue Odd = DAG.getNode(TopOpc, DL, WideResVT, LHS, RHS);
+    return std::make_pair(Even, Odd);
+  };
+
+  bool MatchedLongOp = matchUADDLOps(N, DAG, LHS, RHS, BotOpc, TopOpc);
+  // Should also work for similar long instructions.
+  // if (!MatchedLongOp) MatchedLongOp = match<OtherLongInstr>Ops(...);
+  if (!MatchedLongOp || LHS.getValueType() != RHS.getValueType())
+    return false;
+  EVT UnExtVT = LHS.getValueType();
+
+  // 128-bit unextended operands.
+  if (UnExtVT == MVT::nxv16i8 || UnExtVT == MVT::nxv8i16 ||
+      UnExtVT == MVT::nxv4i32) {
+    auto [Even, Odd] = CreateLongOpPair(LHS, RHS);
+    EVT WideResVT = Even.getValueType();
+    // Widening operations deinterleave the results. Shuffle them to get
+    // their natural order.
+    SDValue Interleave =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), Even, Odd);
+    SDValue Concat = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        Interleave.getValue(0), Interleave.getValue(1));
+    Results.push_back(DAG.getZExtOrTrunc(Concat, DL, VT));
+    return true;
+  }
+
+  // 256-bit/512-bit unextended operands. Try to optimize by reducing the number
+  // of shuffles in cases where the operands are interleaved from existing
+  // even/odd pairs.
+  if (UnExtVT == MVT::nxv16i16 || UnExtVT == MVT::nxv8i32) {
+    // For the pattern:
+    //   (LHSBot, LHSTop) = vector_interleave(LHSEven, LHSOdd)
+    //   (RHSBot, RHSTop) = vector_interleave(RHSEven, RHSOdd)
+    //   LHS = concat(LHSBot, LHSTop)
+    //   RHS = concat(RHSBot, RHSTop)
+    //   op(zext(LHS), zext(RHS))
+    // We can use the pre-interleaved operands to create the longOp(b|t) and
+    // push the shuffles across the operation.
+    SDValue LHSBot, LHSTop, RHSBot, RHSTop;
+    SDValue LHSEven, LHSOdd, RHSEven, RHSOdd;
+
+    if (!sd_match(LHS, sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(LHSBot),
+                                  sd::m_Value(LHSTop))))
+      return false;
+    if (LHSTop.getNode() != LHSBot.getNode() || LHSTop == LHSBot ||
+        !sd_match(LHSBot.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(LHSEven),
+                             sd::m_Value(LHSOdd))))
+      return false;
+
+    if (!sd_match(RHS, sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(RHSBot),
+                                  sd::m_Value(RHSTop))))
+      return false;
+    if (RHSTop.getNode() != RHSBot.getNode() || RHSTop == RHSBot ||
+        !sd_match(RHSBot.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(RHSEven),
+                             sd::m_Value(RHSOdd))))
+      return false;
+
+    // Do the following:
+    //   v0 = longOpb(LHSEven, RHSEven)
+    //   v1 = longOpt(LHSEven, RHSEven)
+    //   v2 = longOpb(LHSOdd, RHSOdd)
+    //   v3 = longOpt(LHSOdd, RHSOdd)
+    //   InterleaveEven = interleave(v0, v2)
+    //   InterleaveOdd  = interleave(v1, v3)
+    //   concat(InterleaveEven[0], InterleaveOdd[0], InterleaveEven[1],
+    //   InterleaveOdd[1])
+    auto [V0, V1] = CreateLongOpPair(LHSEven, RHSEven);
+    auto [V2, V3] = CreateLongOpPair(LHSOdd, RHSOdd);
+    EVT WideResVT = V0.getValueType();
+
+    SDValue InterleaveEven =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V0, V2);
+    SDValue InterleaveOdd =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V1, V3);
+
+    SDValue Concat0 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        InterleaveEven.getValue(0), InterleaveOdd.getValue(0));
+    SDValue Concat1 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        InterleaveEven.getValue(1), InterleaveOdd.getValue(1));
+    SDValue Concat =
+        DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                    Concat0.getValueType().getDoubleNumVectorElementsVT(Ctx),
+                    Concat0, Concat1);
+    Results.push_back(DAG.getZExtOrTrunc(Concat, DL, VT));
+    return true;
+  }
+
+  if (UnExtVT == MVT::nxv16i32) {
+    // [LHS0, LHS2] = interleave(...)
+    // [LHS1, LHS3] = interleave(...)
+    // LHS = concat(concat(LHS0, LHS1), concat(LHS2, LHS3))
+    // See comments for 256-bit unextended operands to understand
+    // where this pattern comes from.
+    // Example:
+    //   LHS = 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+    //   LHS0 = 3, 2, 1, 0
+    //   LHS1 = 7, 6, 5, 4
+    //   LHS2 = 11, 10, 9, 8
+    //   LHS3 = 15, 14, 13, 12
+    // After looking through the interleaves (i.e. the pre-interleaved values):
+    //   LHS0 = 10, 8, 2, 0
+    //   LHS1 = 14, 12, 6, 4
+    //   LHS2 = 11, 9, 3, 1
+    //   LHS3 = 15, 13, 7, 5
+
+    SDValue LHS0, LHS1, LHS2, LHS3;
+    SDValue RHS0, RHS1, RHS2, RHS3;
+    if (!sd_match(LHS,
+                  sd::m_Node(ISD::CONCAT_VECTORS,
+                             sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(LHS0),
+                                        sd::m_Value(LHS1)),
+                             sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(LHS2),
+                                        sd::m_Value(LHS3)))))
+      return false;
+    if (!sd_match(RHS,
+                  sd::m_Node(ISD::CONCAT_VECTORS,
+                             sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(RHS0),
+                                        sd::m_Value(RHS1)),
+                             sd::m_Node(ISD::CONCAT_VECTORS, sd::m_Value(RHS2),
+                                        sd::m_Value(RHS3)))))
+      return false;
+
+    if (LHS0.getNode() != LHS2.getNode() || LHS0 == LHS2 ||
+        !sd_match(LHS0.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(LHS0),
+                             sd::m_Value(LHS2))))
+      return false;
+    if (LHS1.getNode() != LHS3.getNode() || LHS1 == LHS3 ||
+        !sd_match(LHS1.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(LHS1),
+                             sd::m_Value(LHS3))))
+      return false;
+
+    if (RHS0.getNode() != RHS2.getNode() || RHS0 == RHS2 ||
+        !sd_match(RHS0.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(RHS0),
+                             sd::m_Value(RHS2))))
+      return false;
+    if (RHS1.getNode() != RHS3.getNode() || RHS1 == RHS3 ||
+        !sd_match(RHS1.getNode(),
+                  sd::m_Node(ISD::VECTOR_INTERLEAVE, sd::m_Value(RHS1),
+                             sd::m_Value(RHS3))))
+      return false;
+
+    // After long operation:
+    //   v0 = 8, 0
+    //   v1 = 10, 2
+    //
+    //   v2 = 12, 4
+    //   v3 = 14, 6
+    //
+    //   v4 = 9, 1
+    //   v5 = 11, 3
+    //
+    //   v6 = 13, 5
+    //   v7 = 15, 7
+    auto [V0, V1] = CreateLongOpPair(LHS0, RHS0);
+    auto [V2, V3] = CreateLongOpPair(LHS1, RHS1);
+    auto [V4, V5] = CreateLongOpPair(LHS2, RHS2);
+    auto [V6, V7] = CreateLongOpPair(LHS3, RHS3);
+    EVT WideResVT = V0.getValueType();
+
+    // Now we can interleave and concat:
+    //   i0 = interleave(v0, v4) ; i0 = [(1, 0), (9, 8)]
+    //   i1 = interleave(v1, v5) ; i1 = [(3, 2), (11, 10)]
+    //   i2 = interleave(v2, v6) ; i2 = [(5, 4), (13, 12)]
+    //   i3 = interleave(v3, v7) ; i3 = [(7, 6), (15, 14)]
+    //   res = concat(i0[0], i1[0]...i0[1], i1[1]...)
+    SDValue Interleave0 =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V0, V4);
+    SDValue Interleave1 =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V1, V5);
+    SDValue Interleave2 =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V2, V6);
+    SDValue Interleave3 =
+        DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+                    DAG.getVTList(WideResVT, WideResVT), V3, V7);
+
+    SDValue Concat0 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        Interleave0.getValue(0), Interleave1.getValue(0));
+    SDValue Concat1 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        Interleave2.getValue(0), Interleave3.getValue(0));
+    SDValue Concat2 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        Interleave0.getValue(1), Interleave1.getValue(1));
+    SDValue Concat3 = DAG.getNode(
+        ISD::CONCAT_VECTORS, DL, WideResVT.getDoubleNumVectorElementsVT(Ctx),
+        Interleave2.getValue(1), Interleave3.getValue(1));
+    Concat0 =
+        DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                    Concat0.getValueType().getDoubleNumVectorElementsVT(Ctx),
+                    Concat0, Concat1);
+    Concat2 =
+        DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                    Concat2.getValueType().getDoubleNumVectorElementsVT(Ctx),
+                    Concat2, Concat3);
+    Concat0 =
+        DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                    Concat0.getValueType().getDoubleNumVectorElementsVT(Ctx),
+                    Concat0, Concat2);
+
+    Results.push_back(DAG.getZExtOrTrunc(Concat0, DL, VT));
+    return true;
+  }
+
+  return false;
+}
+
 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                SelectionDAG &DAG,
                                const AArch64Subtarget *Subtarget) {
@@ -25429,6 +25718,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
     return;
   case ISD::ADD:
   case ISD::FADD:
+    if (replaceIntOpWithSVE2LongOp(N, Results, DAG, Subtarget))
+      return;
+
     ReplaceAddWithADDP(N, Results, DAG, Subtarget);
     return;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5e..25f40b553b74f8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -220,6 +220,9 @@ enum NodeType : unsigned {
   URSHR_I,
   URSHR_I_PRED,
 
+  UADDLB,
+  UADDLT,
+
   // Vector narrowing shift by immediate (bottom)
   RSHRNB_I,
 
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 6972acd985cb9a..8f592cf0a5a3b5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3556,6 +3556,22 @@ let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in {
   defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
 } // End HasSVE2orSME, UseExperimentalZeroingPseudos
 
+def SDT_AArch64ArithLong_Unpred : SDTypeProfile<1, 2, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>,
+  SDTCisInt<0>, SDTCisInt<1>,
+  SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64uaddlb_node : SDNode<"AArch64ISD::UADDLB",  SDT_AArch64ArithLong_Unpred>;
+def AArch64uaddlt_node : SDNode<"AArch64ISD::UADDLT",  SDT_AArch64ArithLong_Unpred>;
+
+// TODO: lower the intrinsic to the ISD node.
+def AArch64uaddlb : PatFrags<(ops node:$op1, node:$op2),
+                           [(int_aarch64_sve_uaddlb node:$op1, node:$op2),
+                            (AArch64uaddlb_node node:$op1, node:$op2)]>;
+def AArch64uaddlt : PatFrags<(ops node:$op1, node:$op2),
+                           [(int_aarch64_sve_uaddlt node:$op1, node:$op2),
+                            (AArch64uaddlt_node node:$op1, node:$op2)]>;
+
 let Predicates = [HasSVE2orSME] in {
   // SVE2 predicated shifts
   defm SQSHL_ZPmI  : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl",  "SQSHL_ZPZI",  int_aarch64_sve_sqshl>;
@@ -3567,8 +3583,8 @@ let Predicates = [HasSVE2orSME] in {
   // SVE2 integer add/subtract long
   defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
   defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
-  defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
-  defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
+  defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", AArch64uaddlb>;
+  defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", AArch64uaddlt>;
   defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
   defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
   defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 7bc31d44bb6547..6779a43738ce6d 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -126,17 +126,15 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: add_ext_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    uunpklo z4.h, z0.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z5.h, z2.b
+; CHECK-NEXT:    uaddlt z3.h, z0.b, z1.b
+; CHECK-NEXT:    uaddlb z0.h, z0.b, z1.b
+; CHECK-NEXT:    uunpkhi z1.h, z2.b
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z1.h, z4.h, z3.h
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
-; CHECK-NEXT:    add z1.h, z2.h, z5.h
+; CHECK-NEXT:    zip2 z4.h, z0.h, z3.h
+; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    add z1.h, z2.h, z1.h
+; CHECK-NEXT:    add z0.h, z0.h, z4.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
diff --git a/llvm/test/CodeGen/AArch64/sve2-uaddl.ll b/llvm/test/CodeGen/AArch64/sve2-uaddl.ll
new file mode 100644
index 00000000000000..caca0db65839d3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-uaddl.ll
@@ -0,0 +1,636 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=SVE %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mattr=+sve2 -o - %s | FileCheck --check-prefix=SVE2 %s
+
+define <vscale x 16 x i16> @foo_noloadSt_scalable_16x8to16x16(
+; SVE-LABEL: foo_noloadSt_scalable_16x8to16x16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z2.h, z0.b
+; SVE-NEXT:    uunpklo z0.h, z0.b
+; SVE-NEXT:    uunpkhi z3.h, z1.b
+; SVE-NEXT:    uunpklo z1.h, z1.b
+; SVE-NEXT:    add z0.h, z1.h, z0.h
+; SVE-NEXT:    add z1.h, z3.h, z2.h
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: foo_noloadSt_scalable_16x8to16x16:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    uaddlt z2.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlb z1.h, z1.b, z0.b
+; SVE2-NEXT:    zip1 z0.h, z1.h, z2.h
+; SVE2-NEXT:    zip2 z1.h, z1.h, z2.h
+; SVE2-NEXT:    ret
+    <vscale x 16 x i8> %A,
+    <vscale x 16 x i8> %B
+    ) {
+  %1 = zext <vscale x 16 x i8> %A to <vscale x 16 x i16>
+  %2 = zext <vscale x 16 x i8> %B to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %2, %1
+  ret <vscale x 16 x i16> %add1
+}
+
+define <vscale x 16 x i32> @foo_noloadSt_scalable_16x8to16x32(
+; SVE-LABEL: foo_noloadSt_scalable_16x8to16x32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpklo z4.h, z1.b
+; SVE-NEXT:    uunpklo z5.h, z0.b
+; SVE-NEXT:    uunpkhi z0.h, z0.b
+; SVE-NEXT:    uunpkhi z1.h, z1.b
+; SVE-NEXT:    uunpklo z26.s, z2.h
+; SVE-NEXT:    uunpkhi z2.s, z2.h
+; SVE-NEXT:    uunpklo z6.s, z5.h
+; SVE-NEXT:    uunpklo z7.s, z4.h
+; SVE-NEXT:    uunpkhi z5.s, z5.h
+; SVE-NEXT:    uunpklo z24.s, z0.h
+; SVE-NEXT:    uunpkhi z0.s, z0.h
+; SVE-NEXT:    uunpkhi z4.s, z4.h
+; SVE-NEXT:    uunpklo z25.s, z1.h
+; SVE-NEXT:    uunpkhi z1.s, z1.h
+; SVE-NEXT:    add z6.s, z7.s, z6.s
+; SVE-NEXT:    uunpkhi z7.s, z3.h
+; SVE-NEXT:    uunpklo z3.s, z3.h
+; SVE-NEXT:    add z27.s, z1.s, z0.s
+; SVE-NEXT:    add z24.s, z25.s, z24.s
+; SVE-NEXT:    add z1.s, z4.s, z5.s
+; SVE-NEXT:    add z0.s, z6.s, z26.s
+; SVE-NEXT:    add z1.s, z1.s, z2.s
+; SVE-NEXT:    add z2.s, z24.s, z3.s
+; SVE-NEXT:    add z3.s, z27.s, z7.s
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: foo_noloadSt_scalable_16x8to16x32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    uaddlt z4.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlb z0.h, z1.b, z0.b
+; SVE2-NEXT:    zip1 z1.h, z0.h, z4.h
+; SVE2-NEXT:    zip2 z0.h, z0.h, z4.h
+; SVE2-NEXT:    uaddlt z4.s, z1.h, z2.h
+; SVE2-NEXT:    uaddlb z1.s, z1.h, z2.h
+; SVE2-NEXT:    uaddlt z5.s, z0.h, z3.h
+; SVE2-NEXT:    uaddlb z3.s, z0.h, z3.h
+; SVE2-NEXT:    zip1 z0.s, z1.s, z4.s
+; SVE2-NEXT:    zip2 z1.s, z1.s, z4.s
+; SVE2-NEXT:    zip1 z2.s, z3.s, z5.s
+; SVE2-NEXT:    zip2 z3.s, z3.s, z5.s
+; SVE2-NEXT:    ret
+    <vscale x 16 x i8> %A,
+    <vscale x 16 x i8> %B,
+    <vscale x 16 x i16> %C
+    ) {
+  %1 = zext <vscale x 16 x i8> %A to <vscale x 16 x i32>
+  %2 = zext <vscale x 16 x i8> %B to <vscale x 16 x i32>
+  %add1 = add nuw nsw <vscale x 16 x i32> %2, %1
+
+  %3 = zext <vscale x 16 x i16> %C to <vscale x 16 x i32>
+  %add2 = add nuw nsw <vscale x 16 x i32> %add1, %3
+  ret <vscale x 16 x i32> %add2
+}
+
+define <vscale x 16 x i64>@foo_noloadSt_scalable_16x8to16x64(
+; SVE-LABEL: foo_noloadSt_scalable_16x8to16x64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; SVE-NEXT:    addvl sp, sp, #-12
+; SVE-NEXT:    str z19, [sp] // 16-byte Folded Spill
+; SVE-NEXT:    str z18, [sp, #1, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z17, [sp, #2, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z16, [sp, #3, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z15, [sp, #4, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z14, [sp, #5, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z13, [sp, #6, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z12, [sp, #7, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z11, [sp, #8, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z10, [sp, #9, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z9, [sp, #10, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z8, [sp, #11, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 96 * VG
+; SVE-NEXT:    .cfi_offset w29, -16
+; SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; SVE-NEXT:    uunpkhi z25.h, z0.b
+; SVE-NEXT:    uunpklo z0.h, z0.b
+; SVE-NEXT:    uunpklo z26.h, z1.b
+; SVE-NEXT:    uunpkhi z1.h, z1.b
+; SVE-NEXT:    uunpklo z24.s, z2.h
+; SVE-NEXT:    uunpkhi z2.s, z2.h
+; SVE-NEXT:    uunpklo z15.s, z3.h
+; SVE-NEXT:    uunpkhi z3.s, z3.h
+; SVE-NEXT:    uunpkhi z16.d, z4.s
+; SVE-NEXT:    uunpklo z4.d, z4.s
+; SVE-NEXT:    uunpklo z17.d, z6.s
+; SVE-NEXT:    uunpkhi z18.d, z5.s
+; SVE-NEXT:    uunpkhi z28.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpklo z29.s, z26.h
+; SVE-NEXT:    uunpkhi z26.s, z26.h
+; SVE-NEXT:    uunpkhi z27.s, z25.h
+; SVE-NEXT:    uunpklo z25.s, z25.h
+; SVE-NEXT:    uunpkhi z30.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z1.h
+; SVE-NEXT:    uunpklo z5.d, z5.s
+; SVE-NEXT:    uunpkhi z19.d, z7.s
+; SVE-NEXT:    uunpklo z7.d, z7.s
+; SVE-NEXT:    uunpkhi z6.d, z6.s
+; SVE-NEXT:    uunpkhi z9.d, z28.s
+; SVE-NEXT:    uunpkhi z10.d, z0.s
+; SVE-NEXT:    uunpklo z0.d, z0.s
+; SVE-NEXT:    uunpkhi z11.d, z29.s
+; SVE-NEXT:    uunpklo z29.d, z29.s
+; SVE-NEXT:    uunpklo z28.d, z28.s
+; SVE-NEXT:    uunpkhi z12.d, z26.s
+; SVE-NEXT:    uunpklo z26.d, z26.s
+; SVE-NEXT:    uunpkhi z31.d, z27.s
+; SVE-NEXT:    uunpklo z27.d, z27.s
+; SVE-NEXT:    uunpkhi z8.d, z25.s
+; SVE-NEXT:    uunpklo z25.d, z25.s
+; SVE-NEXT:    uunpkhi z13.d, z30.s
+; SVE-NEXT:    uunpklo z30.d, z30.s
+; SVE-NEXT:    uunpkhi z14.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z1.s
+; SVE-NEXT:    add z0.d, z29.d, z0.d
+; SVE-NEXT:    add z29.d, z11.d, z10.d
+; SVE-NEXT:    add z26.d, z26.d, z28.d
+; SVE-NEXT:    add z28.d, z12.d, z9.d
+; SVE-NEXT:    uunpklo z9.d, z24.s
+; SVE-NEXT:    uunpklo z10.d, z2.s
+; SVE-NEXT:    uunpklo z11.d, z15.s
+; SVE-NEXT:    uunpkhi z12.d, z15.s
+; SVE-NEXT:    uunpklo z15.d, z3.s
+; SVE-NEXT:    uunpkhi z3.d, z3.s
+; SVE-NEXT:    uunpkhi z24.d, z24.s
+; SVE-NEXT:    uunpkhi z2.d, z2.s
+; SVE-NEXT:    add z25.d, z1.d, z25.d
+; SVE-NEXT:    add z8.d, z14.d, z8.d
+; SVE-NEXT:    ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z27.d, z30.d, z27.d
+; SVE-NEXT:    add z30.d, z13.d, z31.d
+; SVE-NEXT:    ldr z13, [sp, #6, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z1.d, z9.d, z4.d
+; SVE-NEXT:    add z5.d, z10.d, z5.d
+; SVE-NEXT:    ldr z10, [sp, #9, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z6.d, z12.d, z6.d
+; SVE-NEXT:    ldr z12, [sp, #7, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z7.d, z15.d, z7.d
+; SVE-NEXT:    ldr z15, [sp, #4, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z9.d, z3.d, z19.d
+; SVE-NEXT:    ldr z19, [sp] // 16-byte Folded Reload
+; SVE-NEXT:    add z4.d, z24.d, z16.d
+; SVE-NEXT:    ldr z16, [sp, #3, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z24.d, z2.d, z18.d
+; SVE-NEXT:    ldr z18, [sp, #1, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z31.d, z11.d, z17.d
+; SVE-NEXT:    ldr z17, [sp, #2, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z11, [sp, #8, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z2.d, z26.d, z5.d
+; SVE-NEXT:    add z5.d, z8.d, z6.d
+; SVE-NEXT:    ldr z8, [sp, #11, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z6.d, z27.d, z7.d
+; SVE-NEXT:    add z7.d, z30.d, z9.d
+; SVE-NEXT:    ldr z9, [sp, #10, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z0.d, z0.d, z1.d
+; SVE-NEXT:    add z1.d, z29.d, z4.d
+; SVE-NEXT:    add z3.d, z28.d, z24.d
+; SVE-NEXT:    add z4.d, z25.d, z31.d
+; SVE-NEXT:    addvl sp, sp, #12
+; SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: foo_noloadSt_scalable_16x8to16x64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    uaddlt z24.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlb z0.h, z1.b, z0.b
+; SVE2-NEXT:    zip1 z1.h, z0.h, z24.h
+; SVE2-NEXT:    zip2 z0.h, z0.h, z24.h
+; SVE2-NEXT:    uaddlt z24.s, z1.h, z2.h
+; SVE2-NEXT:    uaddlb z1.s, z1.h, z2.h
+; SVE2-NEXT:    uaddlt z2.s, z0.h, z3.h
+; SVE2-NEXT:    uaddlb z0.s, z0.h, z3.h
+; SVE2-NEXT:    zip2 z3.s, z1.s, z24.s
+; SVE2-NEXT:    zip1 z1.s, z1.s, z24.s
+; SVE2-NEXT:    zip1 z24.s, z0.s, z2.s
+; SVE2-NEXT:    zip2 z0.s, z0.s, z2.s
+; SVE2-NEXT:    uaddlt z2.d, z1.s, z4.s
+; SVE2-NEXT:    uaddlb z1.d, z1.s, z4.s
+; SVE2-NEXT:    uaddlt z4.d, z3.s, z5.s
+; SVE2-NEXT:    uaddlb z3.d, z3.s, z5.s
+; SVE2-NEXT:    uaddlt z5.d, z24.s, z6.s
+; SVE2-NEXT:    uaddlb z6.d, z24.s, z6.s
+; SVE2-NEXT:    uaddlt z24.d, z0.s, z7.s
+; SVE2-NEXT:    uaddlb z7.d, z0.s, z7.s
+; SVE2-NEXT:    zip1 z0.d, z1.d, z2.d
+; SVE2-NEXT:    zip2 z1.d, z1.d, z2.d
+; SVE2-NEXT:    zip1 z2.d, z3.d, z4.d
+; SVE2-NEXT:    zip2 z3.d, z3.d, z4.d
+; SVE2-NEXT:    zip1 z4.d, z6.d, z5.d
+; SVE2-NEXT:    zip2 z5.d, z6.d, z5.d
+; SVE2-NEXT:    zip1 z6.d, z7.d, z24.d
+; SVE2-NEXT:    zip2 z7.d, z7.d, z24.d
+; SVE2-NEXT:    ret
+    <vscale x 16 x i8> %A,
+    <vscale x 16 x i8> %B,
+    <vscale x 16 x i16> %C,
+    <vscale x 16 x i32> %D
+    ) {
+  %1 = zext <vscale x 16 x i8> %A to <vscale x 16 x i64>
+  %2 = zext <vscale x 16 x i8> %B to <vscale x 16 x i64>
+  %add1 = add nuw nsw <vscale x 16 x i64> %2, %1
+
+  %3 = zext <vscale x 16 x i16> %C to <vscale x 16 x i64>
+  %add2 = add nuw nsw <vscale x 16 x i64> %add1, %3
+
+  %4 = zext <vscale x 16 x i32> %D to <vscale x 16 x i64>
+  %add3 = add nuw nsw <vscale x 16 x i64> %add2, %4
+
+  ret <vscale x 16 x i64> %add3
+}
+
+define <vscale x 16 x i32> @addlong_tree_noloadSt_scalable_16x8to16x32(
+; SVE-LABEL: addlong_tree_noloadSt_scalable_16x8to16x32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; SVE-NEXT:    addvl sp, sp, #-1
+; SVE-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; SVE-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; SVE-NEXT:    .cfi_offset w29, -16
+; SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; SVE-NEXT:    uunpklo z4.h, z0.b
+; SVE-NEXT:    uunpkhi z0.h, z0.b
+; SVE-NEXT:    uunpklo z5.h, z1.b
+; SVE-NEXT:    uunpkhi z1.h, z1.b
+; SVE-NEXT:    uunpklo z6.h, z2.b
+; SVE-NEXT:    uunpkhi z2.h, z2.b
+; SVE-NEXT:    uunpklo z7.h, z3.b
+; SVE-NEXT:    uunpkhi z3.h, z3.b
+; SVE-NEXT:    uunpklo z25.s, z0.h
+; SVE-NEXT:    uunpkhi z0.s, z0.h
+; SVE-NEXT:    uunpklo z24.s, z4.h
+; SVE-NEXT:    uunpklo z27.s, z1.h
+; SVE-NEXT:    uunpkhi z1.s, z1.h
+; SVE-NEXT:    uunpklo z29.s, z2.h
+; SVE-NEXT:    uunpkhi z2.s, z2.h
+; SVE-NEXT:    uunpklo z31.s, z3.h
+; SVE-NEXT:    uunpkhi z3.s, z3.h
+; SVE-NEXT:    uunpkhi z4.s, z4.h
+; SVE-NEXT:    uunpklo z26.s, z5.h
+; SVE-NEXT:    uunpkhi z5.s, z5.h
+; SVE-NEXT:    uunpklo z28.s, z6.h
+; SVE-NEXT:    uunpkhi z6.s, z6.h
+; SVE-NEXT:    uunpklo z30.s, z7.h
+; SVE-NEXT:    uunpkhi z7.s, z7.h
+; SVE-NEXT:    add z8.s, z1.s, z0.s
+; SVE-NEXT:    add z25.s, z27.s, z25.s
+; SVE-NEXT:    add z3.s, z3.s, z2.s
+; SVE-NEXT:    add z2.s, z31.s, z29.s
+; SVE-NEXT:    add z1.s, z5.s, z4.s
+; SVE-NEXT:    add z0.s, z26.s, z24.s
+; SVE-NEXT:    add z4.s, z30.s, z28.s
+; SVE-NEXT:    add z5.s, z7.s, z6.s
+; SVE-NEXT:    add z3.s, z8.s, z3.s
+; SVE-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; SVE-NEXT:    add z2.s, z25.s, z2.s
+; SVE-NEXT:    add z0.s, z0.s, z4.s
+; SVE-NEXT:    add z1.s, z1.s, z5.s
+; SVE-NEXT:    addvl sp, sp, #1
+; SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: addlong_tree_noloadSt_scalable_16x8to16x32:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    uaddlb z4.h, z3.b, z2.b
+; SVE2-NEXT:    uaddlb z5.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlt z0.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlt z1.h, z3.b, z2.b
+; SVE2-NEXT:    uaddlb z3.s, z5.h, z4.h
+; SVE2-NEXT:    uaddlt z4.s, z5.h, z4.h
+; SVE2-NEXT:    uaddlb z2.s, z0.h, z1.h
+; SVE2-NEXT:    uaddlt z6.s, z0.h, z1.h
+; SVE2-NEXT:    zip1 z0.s, z3.s, z2.s
+; SVE2-NEXT:    zip1 z1.s, z4.s, z6.s
+; SVE2-NEXT:    zip2 z2.s, z3.s, z2.s
+; SVE2-NEXT:    zip2 z3.s, z4.s, z6.s
+; SVE2-NEXT:    ret
+    <vscale x 16 x i8> %A,
+    <vscale x 16 x i8> %B,
+    <vscale x 16 x i8> %C,
+    <vscale x 16 x i8> %D
+    ) {
+  %1 = zext <vscale x 16 x i8> %A to <vscale x 16 x i32>
+  %2 = zext <vscale x 16 x i8> %B to <vscale x 16 x i32>
+  %add1 = add nuw nsw <vscale x 16 x i32> %2, %1
+
+  %a1 = zext <vscale x 16 x i8> %C to <vscale x 16 x i32>
+  %a2 = zext <vscale x 16 x i8> %D to <vscale x 16 x i32>
+  %add2 = add nuw nsw <vscale x 16 x i32> %a2, %a1
+
+  %add3 = add nuw nsw <vscale x 16 x i32> %add1, %add2
+  ret <vscale x 16 x i32> %add3
+}
+
+
+define <vscale x 16 x i64> @addlong_tree_noloadSt_scalable_16x8to16x64(
+; SVE-LABEL: addlong_tree_noloadSt_scalable_16x8to16x64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; SVE-NEXT:    addvl sp, sp, #-16
+; SVE-NEXT:    str z23, [sp] // 16-byte Folded Spill
+; SVE-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    addvl sp, sp, #-5
+; SVE-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 168 * VG
+; SVE-NEXT:    .cfi_offset w29, -16
+; SVE-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; SVE-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; SVE-NEXT:    uunpkhi z24.h, z0.b
+; SVE-NEXT:    uunpklo z0.h, z0.b
+; SVE-NEXT:    uunpklo z26.h, z1.b
+; SVE-NEXT:    uunpkhi z1.h, z1.b
+; SVE-NEXT:    uunpkhi z25.h, z2.b
+; SVE-NEXT:    uunpklo z15.h, z2.b
+; SVE-NEXT:    uunpkhi z17.h, z3.b
+; SVE-NEXT:    uunpklo z18.h, z3.b
+; SVE-NEXT:    uunpkhi z21.h, z4.b
+; SVE-NEXT:    uunpkhi z28.s, z0.h
+; SVE-NEXT:    uunpklo z0.s, z0.h
+; SVE-NEXT:    uunpklo z29.s, z26.h
+; SVE-NEXT:    uunpkhi z26.s, z26.h
+; SVE-NEXT:    uunpkhi z27.s, z24.h
+; SVE-NEXT:    uunpklo z24.s, z24.h
+; SVE-NEXT:    uunpkhi z30.s, z1.h
+; SVE-NEXT:    uunpklo z31.s, z1.h
+; SVE-NEXT:    uunpkhi z10.d, z28.s
+; SVE-NEXT:    uunpkhi z11.d, z0.s
+; SVE-NEXT:    uunpklo z0.d, z0.s
+; SVE-NEXT:    uunpkhi z12.d, z29.s
+; SVE-NEXT:    uunpklo z29.d, z29.s
+; SVE-NEXT:    uunpkhi z13.d, z26.s
+; SVE-NEXT:    uunpkhi z1.d, z27.s
+; SVE-NEXT:    uunpklo z8.d, z27.s
+; SVE-NEXT:    uunpkhi z9.d, z24.s
+; SVE-NEXT:    uunpklo z27.d, z24.s
+; SVE-NEXT:    uunpklo z28.d, z28.s
+; SVE-NEXT:    uunpklo z14.d, z26.s
+; SVE-NEXT:    uunpkhi z24.d, z30.s
+; SVE-NEXT:    uunpklo z30.d, z30.s
+; SVE-NEXT:    uunpklo z16.d, z31.s
+; SVE-NEXT:    uunpkhi z31.d, z31.s
+; SVE-NEXT:    add z0.d, z29.d, z0.d
+; SVE-NEXT:    add z26.d, z12.d, z11.d
+; SVE-NEXT:    add z3.d, z13.d, z10.d
+; SVE-NEXT:    uunpkhi z29.s, z25.h
+; SVE-NEXT:    uunpklo z11.s, z15.h
+; SVE-NEXT:    uunpkhi z12.s, z17.h
+; SVE-NEXT:    uunpklo z13.s, z18.h
+; SVE-NEXT:    add z2.d, z14.d, z28.d
+; SVE-NEXT:    str z0, [sp, #4, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    add z27.d, z16.d, z27.d
+; SVE-NEXT:    add z30.d, z30.d, z8.d
+; SVE-NEXT:    add z28.d, z31.d, z9.d
+; SVE-NEXT:    uunpkhi z31.s, z18.h
+; SVE-NEXT:    uunpklo z9.s, z17.h
+; SVE-NEXT:    uunpkhi z8.d, z29.s
+; SVE-NEXT:    uunpkhi z16.d, z11.s
+; SVE-NEXT:    uunpklo z11.d, z11.s
+; SVE-NEXT:    uunpkhi z17.d, z12.s
+; SVE-NEXT:    uunpklo z18.d, z13.s
+; SVE-NEXT:    uunpklo z25.s, z25.h
+; SVE-NEXT:    uunpkhi z10.s, z15.h
+; SVE-NEXT:    uunpklo z29.d, z29.s
+; SVE-NEXT:    uunpklo z12.d, z12.s
+; SVE-NEXT:    add z24.d, z24.d, z1.d
+; SVE-NEXT:    uunpklo z19.d, z31.s
+; SVE-NEXT:    uunpkhi z13.d, z13.s
+; SVE-NEXT:    uunpkhi z20.d, z9.s
+; SVE-NEXT:    uunpklo z9.d, z9.s
+; SVE-NEXT:    uunpkhi z31.d, z31.s
+; SVE-NEXT:    add z0.d, z18.d, z11.d
+; SVE-NEXT:    add z8.d, z17.d, z8.d
+; SVE-NEXT:    uunpkhi z14.d, z25.s
+; SVE-NEXT:    uunpklo z15.d, z10.s
+; SVE-NEXT:    uunpklo z25.d, z25.s
+; SVE-NEXT:    uunpkhi z10.d, z10.s
+; SVE-NEXT:    add z29.d, z12.d, z29.d
+; SVE-NEXT:    add z11.d, z13.d, z16.d
+; SVE-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    add z0.d, z24.d, z8.d
+; SVE-NEXT:    uunpkhi z8.s, z21.h
+; SVE-NEXT:    add z12.d, z20.d, z14.d
+; SVE-NEXT:    uunpklo z14.h, z4.b
+; SVE-NEXT:    uunpklo z20.h, z7.b
+; SVE-NEXT:    add z13.d, z19.d, z15.d
+; SVE-NEXT:    uunpklo z15.h, z5.b
+; SVE-NEXT:    add z9.d, z9.d, z25.d
+; SVE-NEXT:    str z0, [sp, #3, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    add z31.d, z31.d, z10.d
+; SVE-NEXT:    add z0.d, z30.d, z29.d
+; SVE-NEXT:    add z26.d, z26.d, z11.d
+; SVE-NEXT:    uunpkhi z10.h, z5.b
+; SVE-NEXT:    uunpkhi z7.h, z7.b
+; SVE-NEXT:    add z25.d, z27.d, z9.d
+; SVE-NEXT:    uunpklo z9.s, z14.h
+; SVE-NEXT:    add z5.d, z2.d, z13.d
+; SVE-NEXT:    str z0, [sp, #2, mul vl] // 16-byte Folded Spill
+; SVE-NEXT:    add z0.d, z28.d, z12.d
+; SVE-NEXT:    add z24.d, z3.d, z31.d
+; SVE-NEXT:    uunpkhi z31.s, z14.h
+; SVE-NEXT:    uunpklo z11.s, z15.h
+; SVE-NEXT:    uunpkhi z12.s, z15.h
+; SVE-NEXT:    uunpkhi z14.h, z6.b
+; SVE-NEXT:    uunpklo z6.h, z6.b
+; SVE-NEXT:    uunpklo z30.s, z21.h
+; SVE-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; SVE-NEXT:    uunpkhi z15.d, z9.s
+; SVE-NEXT:    uunpklo z9.d, z9.s
+; SVE-NEXT:    uunpkhi z27.d, z8.s
+; SVE-NEXT:    uunpklo z28.d, z8.s
+; SVE-NEXT:    uunpkhi z8.s, z10.h
+; SVE-NEXT:    uunpkhi z13.d, z31.s
+; SVE-NEXT:    uunpklo z31.d, z31.s
+; SVE-NEXT:    uunpkhi z17.d, z11.s
+; SVE-NEXT:    uunpklo z11.d, z11.s
+; SVE-NEXT:    uunpklo z18.d, z12.s
+; SVE-NEXT:    uunpkhi z12.d, z12.s
+; SVE-NEXT:    uunpkhi z19.s, z14.h
+; SVE-NEXT:    uunpklo z10.s, z10.h
+; SVE-NEXT:    uunpklo z14.s, z14.h
+; SVE-NEXT:    uunpkhi z29.d, z30.s
+; SVE-NEXT:    uunpklo z30.d, z30.s
+; SVE-NEXT:    uunpkhi z16.d, z8.s
+; SVE-NEXT:    uunpklo z8.d, z8.s
+; SVE-NEXT:    add z9.d, z11.d, z9.d
+; SVE-NEXT:    add z11.d, z17.d, z15.d
+; SVE-NEXT:    add z31.d, z18.d, z31.d
+; SVE-NEXT:    uunpkhi z15.s, z6.h
+; SVE-NEXT:    uunpklo z6.s, z6.h
+; SVE-NEXT:    add z12.d, z12.d, z13.d
+; SVE-NEXT:    uunpkhi z13.d, z19.s
+; SVE-NEXT:    uunpklo z17.s, z20.h
+; SVE-NEXT:    uunpklo z18.d, z19.s
+; SVE-NEXT:    uunpklo z19.s, z7.h
+; SVE-NEXT:    uunpkhi z7.s, z7.h
+; SVE-NEXT:    uunpkhi z20.s, z20.h
+; SVE-NEXT:    uunpkhi z21.d, z10.s
+; SVE-NEXT:    uunpklo z10.d, z10.s
+; SVE-NEXT:    uunpkhi z22.d, z14.s
+; SVE-NEXT:    uunpkhi z4.d, z6.s
+; SVE-NEXT:    uunpklo z6.d, z6.s
+; SVE-NEXT:    uunpklo z14.d, z14.s
+; SVE-NEXT:    uunpkhi z2.d, z17.s
+; SVE-NEXT:    uunpklo z17.d, z17.s
+; SVE-NEXT:    uunpkhi z23.d, z15.s
+; SVE-NEXT:    uunpkhi z3.d, z7.s
+; SVE-NEXT:    uunpklo z15.d, z15.s
+; SVE-NEXT:    uunpkhi z1.d, z20.s
+; SVE-NEXT:    uunpklo z20.d, z20.s
+; SVE-NEXT:    uunpklo z0.d, z19.s
+; SVE-NEXT:    uunpklo z7.d, z7.s
+; SVE-NEXT:    add z30.d, z10.d, z30.d
+; SVE-NEXT:    ldr z10, [sp, #4, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    uunpkhi z19.d, z19.s
+; SVE-NEXT:    add z6.d, z17.d, z6.d
+; SVE-NEXT:    add z28.d, z8.d, z28.d
+; SVE-NEXT:    add z2.d, z2.d, z4.d
+; SVE-NEXT:    add z3.d, z3.d, z13.d
+; SVE-NEXT:    ldr z13, [sp, #1, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z29.d, z21.d, z29.d
+; SVE-NEXT:    add z4.d, z20.d, z15.d
+; SVE-NEXT:    add z0.d, z0.d, z14.d
+; SVE-NEXT:    add z7.d, z7.d, z18.d
+; SVE-NEXT:    add z6.d, z9.d, z6.d
+; SVE-NEXT:    add z27.d, z16.d, z27.d
+; SVE-NEXT:    add z1.d, z1.d, z23.d
+; SVE-NEXT:    add z10.d, z10.d, z13.d
+; SVE-NEXT:    add z8.d, z19.d, z22.d
+; SVE-NEXT:    add z2.d, z11.d, z2.d
+; SVE-NEXT:    add z30.d, z30.d, z0.d
+; SVE-NEXT:    add z4.d, z31.d, z4.d
+; SVE-NEXT:    add z7.d, z28.d, z7.d
+; SVE-NEXT:    add z9.d, z12.d, z1.d
+; SVE-NEXT:    add z27.d, z27.d, z3.d
+; SVE-NEXT:    add z0.d, z10.d, z6.d
+; SVE-NEXT:    ldr z6, [sp, #2, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z28.d, z29.d, z8.d
+; SVE-NEXT:    add z1.d, z26.d, z2.d
+; SVE-NEXT:    add z2.d, z5.d, z4.d
+; SVE-NEXT:    ldr z5, [sp] // 16-byte Folded Reload
+; SVE-NEXT:    add z3.d, z24.d, z9.d
+; SVE-NEXT:    add z4.d, z25.d, z30.d
+; SVE-NEXT:    add z6.d, z6.d, z7.d
+; SVE-NEXT:    ldr z7, [sp, #3, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    add z5.d, z5.d, z28.d
+; SVE-NEXT:    add z7.d, z7.d, z27.d
+; SVE-NEXT:    addvl sp, sp, #5
+; SVE-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; SVE-NEXT:    addvl sp, sp, #16
+; SVE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: addlong_tree_noloadSt_scalable_16x8to16x64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    uaddlt z24.h, z3.b, z2.b
+; SVE2-NEXT:    uaddlt z25.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlb z0.h, z1.b, z0.b
+; SVE2-NEXT:    uaddlb z1.h, z3.b, z2.b
+; SVE2-NEXT:    uaddlt z2.h, z5.b, z4.b
+; SVE2-NEXT:    uaddlb z3.h, z5.b, z4.b
+; SVE2-NEXT:    uaddlt z4.h, z7.b, z6.b
+; SVE2-NEXT:    uaddlb z5.h, z7.b, z6.b
+; SVE2-NEXT:    uaddlb z7.s, z25.h, z24.h
+; SVE2-NEXT:    uaddlt z24.s, z25.h, z24.h
+; SVE2-NEXT:    uaddlb z6.s, z0.h, z1.h
+; SVE2-NEXT:    uaddlt z0.s, z0.h, z1.h
+; SVE2-NEXT:    uaddlb z26.s, z3.h, z5.h
+; SVE2-NEXT:    uaddlb z27.s, z2.h, z4.h
+; SVE2-NEXT:    uaddlt z2.s, z2.h, z4.h
+; SVE2-NEXT:    uaddlt z1.s, z3.h, z5.h
+; SVE2-NEXT:    uaddlb z4.d, z7.s, z27.s
+; SVE2-NEXT:    uaddlb z5.d, z6.s, z26.s
+; SVE2-NEXT:    uaddlt z7.d, z7.s, z27.s
+; SVE2-NEXT:    uaddlt z6.d, z6.s, z26.s
+; SVE2-NEXT:    uaddlb z25.d, z24.s, z2.s
+; SVE2-NEXT:    uaddlb z26.d, z0.s, z1.s
+; SVE2-NEXT:    uaddlt z24.d, z24.s, z2.s
+; SVE2-NEXT:    uaddlt z27.d, z0.s, z1.s
+; SVE2-NEXT:    zip1 z0.d, z5.d, z4.d
+; SVE2-NEXT:    zip2 z4.d, z5.d, z4.d
+; SVE2-NEXT:    zip1 z1.d, z6.d, z7.d
+; SVE2-NEXT:    zip1 z2.d, z26.d, z25.d
+; SVE2-NEXT:    zip2 z5.d, z6.d, z7.d
+; SVE2-NEXT:    zip1 z3.d, z27.d, z24.d
+; SVE2-NEXT:    zip2 z6.d, z26.d, z25.d
+; SVE2-NEXT:    zip2 z7.d, z27.d, z24.d
+; SVE2-NEXT:    ret
+    <vscale x 16 x i8> %A,
+    <vscale x 16 x i8> %B,
+    <vscale x 16 x i8> %C,
+    <vscale x 16 x i8> %D,
+    <vscale x 16 x i8> %E,
+    <vscale x 16 x i8> %F,
+    <vscale x 16 x i8> %G,
+    <vscale x 16 x i8> %H
+    ) {
+
+  %1 = zext <vscale x 16 x i8> %A to <vscale x 16 x i64>
+  %2 = zext <vscale x 16 x i8> %B to <vscale x 16 x i64>
+  %add1 = add nuw nsw <vscale x 16 x i64> %2, %1
+
+  %a1 = zext <vscale x 16 x i8> %C to <vscale x 16 x i64>
+  %a2 = zext <vscale x 16 x i8> %D to <vscale x 16 x i64>
+  %add2 = add nuw nsw <vscale x 16 x i64> %a2, %a1
+
+  %add3 = add nuw nsw <vscale x 16 x i64> %add1, %add2
+
+
+  %a1a = zext <vscale x 16 x i8> %E to <vscale x 16 x i64>
+  %a2a = zext <vscale x 16 x i8> %F to <vscale x 16 x i64>
+  %add1a = add nuw nsw <vscale x 16 x i64> %a2a, %a1a
+
+  %aa1 = zext <vscale x 16 x i8> %G to <vscale x 16 x i64>
+  %aa2 = zext <vscale x 16 x i8> %H to <vscale x 16 x i64>
+  %add2a = add nuw nsw <vscale x 16 x i64> %aa2, %aa1
+
+  %add3a = add nuw nsw <vscale x 16 x i64> %add1a, %add2a
+
+
+  %add4 = add nuw nsw <vscale x 16 x i64> %add3, %add3a
+  ret <vscale x 16 x i64> %add4
+}


