[llvm] [AArch64][GlobalISel] Combine vecreduce(ext) to {U/S}ADDLV (PR #75832)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 12 13:46:09 PST 2024


https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/75832

>From 02525891d3ae4c02b5a39624c0318d577591e9a4 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 12 Dec 2023 16:50:23 +0000
Subject: [PATCH 1/5] [GlobalISel] Refactor extractParts()

Moved extractParts() and extractVectorParts() from LegalizerHelper
to Utils so that they can be used in different passes.

extractParts() will also try to use unmerge when doing irregular
splits where possible, falling back to extracting elements otherwise.
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |  16 --
 llvm/include/llvm/CodeGen/GlobalISel/Utils.h  |  19 +++
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 150 ++++--------------
 llvm/lib/CodeGen/GlobalISel/Utils.cpp         | 143 +++++++++++++++++
 .../GISel/AArch64PreLegalizerCombiner.cpp     |  75 ++++-----
 .../GlobalISel/legalize-shuffle-vector.mir    |  33 +++-
 .../GlobalISel/legalize-store-global.mir      |  36 ++---
 7 files changed, 273 insertions(+), 199 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 586679fa295431..a7ecf0dc1ba216 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -187,22 +187,6 @@ class LegalizerHelper {
   LegalizeResult widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                  LLT WideTy);
 
-  /// Helper function to split a wide generic register into bitwise blocks with
-  /// the given Type (which implies the number of blocks needed). The generic
-  /// registers created are appended to Ops, starting at bit 0 of Reg.
-  void extractParts(Register Reg, LLT Ty, int NumParts,
-                    SmallVectorImpl<Register> &VRegs);
-
-  /// Version which handles irregular splits.
-  bool extractParts(Register Reg, LLT RegTy, LLT MainTy,
-                    LLT &LeftoverTy,
-                    SmallVectorImpl<Register> &VRegs,
-                    SmallVectorImpl<Register> &LeftoverVRegs);
-
-  /// Version which handles irregular sub-vector splits.
-  void extractVectorParts(Register Reg, unsigned NumElst,
-                          SmallVectorImpl<Register> &VRegs);
-
   /// Helper function to build a wide generic register \p DstReg of type \p
   /// RegTy from smaller parts. This will produce a G_MERGE_VALUES,
   /// G_BUILD_VECTOR, G_CONCAT_VECTORS, or sequence of G_INSERT as appropriate
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index ffb6e53a0363f9..617953dda9e90e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -33,6 +33,7 @@ class BlockFrequencyInfo;
 class GISelKnownBits;
 class MachineFunction;
 class MachineInstr;
+class MachineIRBuilder;
 class MachineOperand;
 class MachineOptimizationRemarkEmitter;
 class MachineOptimizationRemarkMissed;
@@ -247,6 +248,24 @@ MachineInstr *getDefIgnoringCopies(Register Reg,
 /// Also walks through hints such as G_ASSERT_ZEXT.
 Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI);
 
+/// Helper function to split a wide generic register into bitwise blocks with
+/// the given Type (which implies the number of blocks needed). The generic
+/// registers created are appended to VRegs, starting at bit 0 of Reg.
+void extractParts(Register Reg, LLT Ty, int NumParts,
+                  SmallVectorImpl<Register> &VRegs,
+                  MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI);
+
+/// Version which handles irregular splits.
+bool extractParts(Register Reg, LLT RegTy, LLT MainTy, LLT &LeftoverTy,
+                  SmallVectorImpl<Register> &VRegs,
+                  SmallVectorImpl<Register> &LeftoverVRegs,
+                  MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI);
+
+/// Version which handles irregular sub-vector splits.
+void extractVectorParts(Register Reg, unsigned NumElst,
+                        SmallVectorImpl<Register> &VRegs,
+                        MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI);
+
 // Templated variant of getOpcodeDef returning a MachineInstr derived T.
 /// See if Reg is defined by an single def instruction of type T
 /// Also try to do trivial folding if it's a COPY with
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 21947a55874aa3..d6f240441b29b2 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -158,100 +158,6 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
   }
 }
 
-void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
-                                   SmallVectorImpl<Register> &VRegs) {
-  for (int i = 0; i < NumParts; ++i)
-    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
-  MIRBuilder.buildUnmerge(VRegs, Reg);
-}
-
-bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
-                                   LLT MainTy, LLT &LeftoverTy,
-                                   SmallVectorImpl<Register> &VRegs,
-                                   SmallVectorImpl<Register> &LeftoverRegs) {
-  assert(!LeftoverTy.isValid() && "this is an out argument");
-
-  unsigned RegSize = RegTy.getSizeInBits();
-  unsigned MainSize = MainTy.getSizeInBits();
-  unsigned NumParts = RegSize / MainSize;
-  unsigned LeftoverSize = RegSize - NumParts * MainSize;
-
-  // Use an unmerge when possible.
-  if (LeftoverSize == 0) {
-    for (unsigned I = 0; I < NumParts; ++I)
-      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
-    MIRBuilder.buildUnmerge(VRegs, Reg);
-    return true;
-  }
-
-  // Perform irregular split. Leftover is last element of RegPieces.
-  if (MainTy.isVector()) {
-    SmallVector<Register, 8> RegPieces;
-    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
-    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
-      VRegs.push_back(RegPieces[i]);
-    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
-    LeftoverTy = MRI.getType(LeftoverRegs[0]);
-    return true;
-  }
-
-  LeftoverTy = LLT::scalar(LeftoverSize);
-  // For irregular sizes, extract the individual parts.
-  for (unsigned I = 0; I != NumParts; ++I) {
-    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
-    VRegs.push_back(NewReg);
-    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
-  }
-
-  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
-       Offset += LeftoverSize) {
-    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
-    LeftoverRegs.push_back(NewReg);
-    MIRBuilder.buildExtract(NewReg, Reg, Offset);
-  }
-
-  return true;
-}
-
-void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
-                                         SmallVectorImpl<Register> &VRegs) {
-  LLT RegTy = MRI.getType(Reg);
-  assert(RegTy.isVector() && "Expected a vector type");
-
-  LLT EltTy = RegTy.getElementType();
-  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
-  unsigned RegNumElts = RegTy.getNumElements();
-  unsigned LeftoverNumElts = RegNumElts % NumElts;
-  unsigned NumNarrowTyPieces = RegNumElts / NumElts;
-
-  // Perfect split without leftover
-  if (LeftoverNumElts == 0)
-    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);
-
-  // Irregular split. Provide direct access to all elements for artifact
-  // combiner using unmerge to elements. Then build vectors with NumElts
-  // elements. Remaining element(s) will be (used to build vector) Leftover.
-  SmallVector<Register, 8> Elts;
-  extractParts(Reg, EltTy, RegNumElts, Elts);
-
-  unsigned Offset = 0;
-  // Requested sub-vectors of NarrowTy.
-  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
-    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
-    VRegs.push_back(MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
-  }
-
-  // Leftover element(s).
-  if (LeftoverNumElts == 1) {
-    VRegs.push_back(Elts[Offset]);
-  } else {
-    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
-    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
-    VRegs.push_back(
-        MIRBuilder.buildMergeLikeInstr(LeftoverTy, Pieces).getReg(0));
-  }
-}
-
 void LegalizerHelper::insertParts(Register DstReg,
                                   LLT ResultTy, LLT PartTy,
                                   ArrayRef<Register> PartRegs,
@@ -293,7 +199,8 @@ void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                        Register Reg) {
   LLT Ty = MRI.getType(Reg);
   SmallVector<Register, 8> RegElts;
-  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
+  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
+               MIRBuilder, MRI);
   Elts.append(RegElts);
 }
 
@@ -1542,7 +1449,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
-                   SrcRegs[i / 2]);
+                   SrcRegs[i / 2], MIRBuilder, MRI);
     }
     MachineBasicBlock &MBB = *MI.getParent();
     MIRBuilder.setInsertPt(MBB, MI);
@@ -1584,13 +1491,13 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
-                      LHSLeftoverRegs))
+                      LHSLeftoverRegs, MIRBuilder, MRI))
       return UnableToLegalize;
 
     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
-                      RHSPartRegs, RHSLeftoverRegs))
+                      RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
       return UnableToLegalize;
 
     // We now have the LHS and RHS of the compare split into narrow-type
@@ -1744,7 +1651,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     Observer.changingInstr(MI);
     SmallVector<Register, 2> SrcRegs, DstRegs;
     unsigned NumParts = SizeOp0 / NarrowSize;
-    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
+                 MIRBuilder, MRI);
 
     for (unsigned i = 0; i < NumParts; ++i) {
       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
@@ -4194,7 +4102,8 @@ LegalizerHelper::fewerElementsVectorMultiEltType(
                      MI.getOperand(UseIdx));
     } else {
       SmallVector<Register, 8> SplitPieces;
-      extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
+      extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
+                         MRI);
       for (auto Reg : SplitPieces)
         InputOpsPieces[UseNo].push_back(Reg);
     }
@@ -4250,7 +4159,8 @@ LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
        UseIdx += 2, ++UseNo) {
     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
-    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
+    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
+                       MIRBuilder, MRI);
   }
 
   // Build PHIs with fewer elements.
@@ -4519,7 +4429,7 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
   } else {
     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
-                     NarrowLeftoverRegs)) {
+                     NarrowLeftoverRegs, MIRBuilder, MRI)) {
       NumParts = NarrowRegs.size();
       NumLeftover = NarrowLeftoverRegs.size();
     }
@@ -4765,8 +4675,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
   unsigned NewElts = NarrowTy.getNumElements();
 
   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
-  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
-  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
+  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
+  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                         SplitSrc2Regs[1]};
 
@@ -4900,7 +4810,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
                           : SrcTy.getNumElements();
 
-  extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
+  extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
   if (NarrowTy.isScalar()) {
     if (DstTy != NarrowTy)
       return UnableToLegalize; // FIXME: handle implicit extensions.
@@ -5001,7 +4911,8 @@ LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
   SmallVector<Register> SplitSrcs;
   // Split the sources into NarrowTy size pieces.
   extractParts(SrcReg, NarrowTy,
-               SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
+               SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
+               MIRBuilder, MRI);
   // We're going to do a tree reduction using vector operations until we have
   // one NarrowTy size value left.
   while (SplitSrcs.size() > 1) {
@@ -5640,8 +5551,10 @@ LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
   LLT LeftoverTy, DummyTy;
   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
-  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
-  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
+  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
+               MIRBuilder, MRI);
+  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
+               MRI);
 
   int NarrowParts = Src1Regs.size();
   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
@@ -5699,8 +5612,8 @@ LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
 
   SmallVector<Register, 2> Src1Parts, Src2Parts;
   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
-  extractParts(Src1, NarrowTy, NumParts, Src1Parts);
-  extractParts(Src2, NarrowTy, NumParts, Src2Parts);
+  extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
+  extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
 
   // Take only high half of registers if this is high mul.
@@ -5752,7 +5665,8 @@ LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
 
   SmallVector<Register, 2> SrcRegs, DstRegs;
   SmallVector<uint64_t, 2> Indexes;
-  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
+               MIRBuilder, MRI);
 
   Register OpReg = MI.getOperand(0).getReg();
   uint64_t OpStart = MI.getOperand(2).getImm();
@@ -5814,7 +5728,7 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
   LLT LeftoverTy;
   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
-               LeftoverRegs);
+               LeftoverRegs, MIRBuilder, MRI);
 
   for (Register Reg : LeftoverRegs)
     SrcRegs.push_back(Reg);
@@ -5899,12 +5813,12 @@ LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
   LLT LeftoverTy;
   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
-                    Src0Regs, Src0LeftoverRegs))
+                    Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
     return UnableToLegalize;
 
   LLT Unused;
   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
-                    Src1Regs, Src1LeftoverRegs))
+                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
     llvm_unreachable("inconsistent extractParts result");
 
   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
@@ -5967,12 +5881,12 @@ LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
   LLT LeftoverTy;
   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
-                    Src1Regs, Src1LeftoverRegs))
+                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
     return UnableToLegalize;
 
   LLT Unused;
   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
-                    Src2Regs, Src2LeftoverRegs))
+                    Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
     llvm_unreachable("inconsistent extractParts result");
 
   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
@@ -6468,7 +6382,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
 
     // First, split the source into two smaller vectors.
     SmallVector<Register, 2> SplitSrcs;
-    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs);
+    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);
 
     // Truncate the splits into intermediate narrower elements.
     LLT InterTy;
@@ -7208,7 +7122,7 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
   int64_t IdxVal;
   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
     SmallVector<Register, 8> SrcRegs;
-    extractParts(SrcVec, EltTy, NumElts, SrcRegs);
+    extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);
 
     if (InsertVal) {
       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index eaf829f562b2dc..2523fdd732eee2 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -476,6 +477,148 @@ Register llvm::getSrcRegIgnoringCopies(Register Reg,
   return DefSrcReg ? DefSrcReg->Reg : Register();
 }
 
+void llvm::extractParts(Register Reg, LLT Ty, int NumParts,
+                        SmallVectorImpl<Register> &VRegs,
+                        MachineIRBuilder &MIRBuilder,
+                        MachineRegisterInfo &MRI) {
+  for (int i = 0; i < NumParts; ++i)
+    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
+  MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+bool llvm::extractParts(Register Reg, LLT RegTy, LLT MainTy, LLT &LeftoverTy,
+                        SmallVectorImpl<Register> &VRegs,
+                        SmallVectorImpl<Register> &LeftoverRegs,
+                        MachineIRBuilder &MIRBuilder,
+                        MachineRegisterInfo &MRI) {
+  assert(!LeftoverTy.isValid() && "this is an out argument");
+
+  unsigned RegSize = RegTy.getSizeInBits();
+  unsigned MainSize = MainTy.getSizeInBits();
+  unsigned NumParts = RegSize / MainSize;
+  unsigned LeftoverSize = RegSize - NumParts * MainSize;
+
+  // Use an unmerge when possible.
+  if (LeftoverSize == 0) {
+    for (unsigned I = 0; I < NumParts; ++I)
+      VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
+    MIRBuilder.buildUnmerge(VRegs, Reg);
+    return true;
+  }
+
+  // Try to use unmerge for irregular vector splits where possible.
+  if (RegTy.isVector() && MainTy.isVector()) {
+    unsigned RegNumElts = RegTy.getNumElements();
+    unsigned MainNumElts = MainTy.getNumElements();
+    unsigned LeftoverNumElts = RegNumElts % MainNumElts;
+    // Only unmerge to LeftoverTy when valid; test '> 1' first to avoid '% 0'.
+    if (LeftoverNumElts > 1 &&
+        RegTy.getScalarSizeInBits() == MainTy.getScalarSizeInBits() &&
+        MainNumElts % LeftoverNumElts == 0 &&
+        RegNumElts % LeftoverNumElts == 0) {
+      LeftoverTy =
+          LLT::fixed_vector(LeftoverNumElts, RegTy.getScalarSizeInBits());
+
+      // Unmerge the SrcReg to LeftoverTy vectors
+      SmallVector<Register, 4> UnmergeValues;
+      extractParts(Reg, LeftoverTy, RegNumElts / LeftoverNumElts, UnmergeValues,
+                   MIRBuilder, MRI);
+
+      // Find how many LeftoverTy makes one MainTy
+      unsigned LeftoverPerMain = MainNumElts / LeftoverNumElts;
+      unsigned NumOfLeftoverVal =
+          ((RegNumElts % MainNumElts) / LeftoverNumElts);
+
+      // Create as many MainTy as possible using unmerged value
+      SmallVector<Register, 4> MergeValues;
+      for (unsigned I = 0; I < UnmergeValues.size() - NumOfLeftoverVal; I++) {
+        MergeValues.push_back(UnmergeValues[I]);
+        if (MergeValues.size() == LeftoverPerMain) {
+          VRegs.push_back(
+              MIRBuilder.buildMergeLikeInstr(MainTy, MergeValues).getReg(0));
+          MergeValues.clear();
+        }
+      }
+      // Populate LeftoverRegs with the leftovers
+      for (unsigned I = UnmergeValues.size() - NumOfLeftoverVal;
+           I < UnmergeValues.size(); I++) {
+        LeftoverRegs.push_back(UnmergeValues[I]);
+      }
+      return true;
+    }
+  }
+  // Perform irregular split. Leftover is last element of RegPieces.
+  if (MainTy.isVector()) {
+    SmallVector<Register, 8> RegPieces;
+    extractVectorParts(Reg, MainTy.getNumElements(), RegPieces, MIRBuilder,
+                       MRI);
+    for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
+      VRegs.push_back(RegPieces[i]);
+    LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
+    LeftoverTy = MRI.getType(LeftoverRegs[0]);
+    return true;
+  }
+
+  LeftoverTy = LLT::scalar(LeftoverSize);
+  // For irregular sizes, extract the individual parts.
+  for (unsigned I = 0; I != NumParts; ++I) {
+    Register NewReg = MRI.createGenericVirtualRegister(MainTy);
+    VRegs.push_back(NewReg);
+    MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
+  }
+
+  for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
+       Offset += LeftoverSize) {
+    Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
+    LeftoverRegs.push_back(NewReg);
+    MIRBuilder.buildExtract(NewReg, Reg, Offset);
+  }
+
+  return true;
+}
+
+void llvm::extractVectorParts(Register Reg, unsigned NumElts,
+                              SmallVectorImpl<Register> &VRegs,
+                              MachineIRBuilder &MIRBuilder,
+                              MachineRegisterInfo &MRI) {
+  LLT RegTy = MRI.getType(Reg);
+  assert(RegTy.isVector() && "Expected a vector type");
+
+  LLT EltTy = RegTy.getElementType();
+  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
+  unsigned RegNumElts = RegTy.getNumElements();
+  unsigned LeftoverNumElts = RegNumElts % NumElts;
+  unsigned NumNarrowTyPieces = RegNumElts / NumElts;
+
+  // Perfect split without leftover
+  if (LeftoverNumElts == 0)
+    return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs, MIRBuilder,
+                        MRI);
+
+  // Irregular split. Provide direct access to all elements for artifact
+  // combiner using unmerge to elements. Then build vectors with NumElts
+  // elements. Remaining element(s) will be (used to build vector) Leftover.
+  SmallVector<Register, 8> Elts;
+  extractParts(Reg, EltTy, RegNumElts, Elts, MIRBuilder, MRI);
+
+  unsigned Offset = 0;
+  // Requested sub-vectors of NarrowTy.
+  for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
+    ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
+    VRegs.push_back(MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
+  }
+
+  // Leftover element(s).
+  if (LeftoverNumElts == 1) {
+    VRegs.push_back(Elts[Offset]);
+  } else {
+    LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
+    ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
+    VRegs.push_back(
+        MIRBuilder.buildMergeLikeInstr(LeftoverTy, Pieces).getReg(0));
+  }
+}
+
 MachineInstr *llvm::getOpcodeDef(unsigned Opcode, Register Reg,
                                  const MachineRegisterInfo &MRI) {
   MachineInstr *DefMI = getDefIgnoringCopies(Reg, MRI);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 71632718857b90..e05d2a1880c1df 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -335,53 +336,45 @@ void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
     SmallVector<Register, 4> Ext1UnmergeReg;
     SmallVector<Register, 4> Ext2UnmergeReg;
     if (SrcTy.getNumElements() % 16 != 0) {
-      // Unmerge source to v8i8, append a new v8i8 of 0s and the merge to v16s
-      SmallVector<Register, 4> PadUnmergeDstReg1;
-      SmallVector<Register, 4> PadUnmergeDstReg2;
-      unsigned NumOfVec = SrcTy.getNumElements() / 8;
-
-      // Unmerge the source to v8i8
-      MachineInstr *PadUnmerge1 =
-          Builder.buildUnmerge(LLT::fixed_vector(8, 8), Ext1SrcReg);
-      MachineInstr *PadUnmerge2 =
-          Builder.buildUnmerge(LLT::fixed_vector(8, 8), Ext2SrcReg);
-      for (unsigned i = 0; i < NumOfVec; i++) {
-        PadUnmergeDstReg1.push_back(PadUnmerge1->getOperand(i).getReg());
-        PadUnmergeDstReg2.push_back(PadUnmerge2->getOperand(i).getReg());
+      SmallVector<Register, 1> Leftover1;
+      SmallVector<Register, 1> Leftover2;
+
+      // Split the elements into v16i8 and v8i8
+      LLT MainTy = LLT::fixed_vector(16, 8);
+      LLT LeftoverTy1, LeftoverTy2;
+      if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
+                         LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
+                         MRI)) ||
+          (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
+                         LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
+                         MRI))) {
+        llvm_unreachable("Unable to split this vector properly");
       }
 
-      // Pad the vectors with a v8i8 constant of 0s
+      // Pad the leftover v8i8 vector with register of 0s of type v8i8
       MachineInstr *v8Zeroes =
           Builder.buildConstant(LLT::fixed_vector(8, 8), 0);
-      PadUnmergeDstReg1.push_back(v8Zeroes->getOperand(0).getReg());
-      PadUnmergeDstReg2.push_back(v8Zeroes->getOperand(0).getReg());
-
-      // Merge them all back to v16i8
-      NumOfVec = (NumOfVec + 1) / 2;
-      for (unsigned i = 0; i < NumOfVec; i++) {
-        Ext1UnmergeReg.push_back(
-            Builder
-                .buildMergeLikeInstr(
-                    LLT::fixed_vector(16, 8),
-                    {PadUnmergeDstReg1[i * 2], PadUnmergeDstReg1[(i * 2) + 1]})
-                .getReg(0));
-        Ext2UnmergeReg.push_back(
-            Builder
-                .buildMergeLikeInstr(
-                    LLT::fixed_vector(16, 8),
-                    {PadUnmergeDstReg2[i * 2], PadUnmergeDstReg2[(i * 2) + 1]})
-                .getReg(0));
-      }
+      Leftover1.push_back(v8Zeroes->getOperand(0).getReg());
+      Leftover2.push_back(v8Zeroes->getOperand(0).getReg());
+
+      Ext1UnmergeReg.push_back(
+          Builder
+              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
+                                   {Leftover1[0], Leftover1[1]})
+              .getReg(0));
+      Ext2UnmergeReg.push_back(
+          Builder
+              .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
+                                   {Leftover2[0], Leftover2[1]})
+              .getReg(0));
+
     } else {
       // Unmerge the source vectors to v16i8
-      MachineInstr *Ext1Unmerge =
-          Builder.buildUnmerge(LLT::fixed_vector(16, 8), Ext1SrcReg);
-      MachineInstr *Ext2Unmerge =
-          Builder.buildUnmerge(LLT::fixed_vector(16, 8), Ext2SrcReg);
-      for (unsigned i = 0, e = SrcTy.getNumElements() / 16; i < e; i++) {
-        Ext1UnmergeReg.push_back(Ext1Unmerge->getOperand(i).getReg());
-        Ext2UnmergeReg.push_back(Ext2Unmerge->getOperand(i).getReg());
-      }
+      unsigned SrcNumElts = SrcTy.getNumElements();
+      extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
+                   Ext1UnmergeReg, Builder, MRI);
+      extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
+                   Ext2UnmergeReg, Builder, MRI);
     }
 
     // Build the UDOT instructions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 864275664882cc..07946388590e29 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -316,6 +316,29 @@ body:             |
     ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
     ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[DEF]](s32)
     ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<4 x s32>), [[BUILD_VECTOR1]], shufflemask(0, 1, 5, 6)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<2 x s32>), [[UV5:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<2 x s32>), [[UV7:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<2 x s32>), [[UV9:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(<2 x s32>), [[UV11:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(<2 x s32>), [[UV13:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[UV14:%[0-9]+]]:_(<2 x s32>), [[UV15:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[SHUF]](<4 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV]](<2 x s32>), [[UV3]](<2 x s32>)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<4 x s32>), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV4]](<2 x s32>), [[UV7]](<2 x s32>)
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS1]](<4 x s32>), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV8]](<2 x s32>), [[UV11]](<2 x s32>)
+    ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS2]](<4 x s32>), [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV12]](<2 x s32>), [[UV15]](<2 x s32>)
+    ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS3]](<4 x s32>), [[C3]](s64)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[EVEC]](s32), [[EVEC1]](s32), [[EVEC2]](s32), [[EVEC3]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR2]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
     %2:_(s32) = COPY $w2
@@ -437,12 +460,14 @@ body:             |
     ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF]](<4 x s32>), [[C3]](s64)
     ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF1]](<4 x s32>), [[C]](s64)
     ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[SHUF1]](<4 x s32>), [[C1]](s64)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[EVEC]](s32), [[EVEC1]](s32), [[EVEC2]](s32), [[EVEC3]](s32)
-    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[EVEC4]](s32), [[EVEC5]](s32)
-    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[COPY8]](p0) :: (store (<4 x s32>), align 32)
+    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[EVEC]](s32), [[EVEC1]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[EVEC2]](s32), [[EVEC3]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[EVEC4]](s32), [[EVEC5]](s32)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s32>), [[BUILD_VECTOR3]](<2 x s32>)
+    ; CHECK-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s32>), [[COPY8]](p0) :: (store (<4 x s32>), align 32)
     ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY8]], [[C4]](s64)
-    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR3]](<2 x s32>), [[PTR_ADD]](p0) :: (store (<2 x s32>) into unknown-address + 16, align 16)
+    ; CHECK-NEXT: G_STORE [[BUILD_VECTOR4]](<2 x s32>), [[PTR_ADD]](p0) :: (store (<2 x s32>) into unknown-address + 16, align 16)
     ; CHECK-NEXT: RET_ReallyLR
     %3:_(s32) = COPY $s0
     %4:_(s32) = COPY $s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
index f20e481ccd4b78..31f28b50462b78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
@@ -6828,13 +6828,12 @@ body: |
     ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; SI-NEXT: [[DEF:%[0-9]+]]:_(<12 x s16>) = G_IMPLICIT_DEF
     ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s32>) = G_BITCAST [[DEF]](<12 x s16>)
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
-    ; SI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s32>), [[UV7:%[0-9]+]]:_(<2 x s32>), [[UV8:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; SI-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
+    ; SI-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>), [[UV2:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
+    ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV]](<2 x s32>), [[UV1]](<2 x s32>)
+    ; SI-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
     ; SI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; SI-NEXT: G_STORE [[UV8]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
+    ; SI-NEXT: G_STORE [[UV2]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
     ;
     ; CI-LABEL: name: test_store_global_v12s16_align4
     ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
@@ -6842,13 +6841,12 @@ body: |
     ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; CI-NEXT: [[DEF:%[0-9]+]]:_(<12 x s16>) = G_IMPLICIT_DEF
     ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s32>) = G_BITCAST [[DEF]](<12 x s16>)
-    ; CI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
-    ; CI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s32>), [[UV7:%[0-9]+]]:_(<2 x s32>), [[UV8:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; CI-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
+    ; CI-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>), [[UV2:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
+    ; CI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV]](<2 x s32>), [[UV1]](<2 x s32>)
+    ; CI-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
     ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; CI-NEXT: G_STORE [[UV8]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
+    ; CI-NEXT: G_STORE [[UV2]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
     ;
     ; VI-LABEL: name: test_store_global_v12s16_align4
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
@@ -6856,13 +6854,12 @@ body: |
     ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; VI-NEXT: [[DEF:%[0-9]+]]:_(<12 x s16>) = G_IMPLICIT_DEF
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s32>) = G_BITCAST [[DEF]](<12 x s16>)
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
-    ; VI-NEXT: [[UV6:%[0-9]+]]:_(<2 x s32>), [[UV7:%[0-9]+]]:_(<2 x s32>), [[UV8:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; VI-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
+    ; VI-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>), [[UV2:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
+    ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV]](<2 x s32>), [[UV1]](<2 x s32>)
+    ; VI-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
     ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; VI-NEXT: G_STORE [[UV8]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
+    ; VI-NEXT: G_STORE [[UV2]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
     ;
     ; GFX9-LABEL: name: test_store_global_v12s16_align4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6
@@ -6870,13 +6867,12 @@ body: |
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
     ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<12 x s16>) = G_IMPLICIT_DEF
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<6 x s32>) = G_BITCAST [[DEF]](<12 x s16>)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
-    ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s32>), [[UV7:%[0-9]+]]:_(<2 x s32>), [[UV8:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
-    ; GFX9-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
+    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>), [[UV2:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[BITCAST]](<6 x s32>)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV]](<2 x s32>), [[UV1]](<2 x s32>)
+    ; GFX9-NEXT: G_STORE [[CONCAT_VECTORS]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1)
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: G_STORE [[UV8]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
+    ; GFX9-NEXT: G_STORE [[UV2]](<2 x s32>), [[PTR_ADD]](p1) :: (store (<2 x s32>) into unknown-address + 16, align 16, addrspace 1)
     %0:_(p1) = COPY $vgpr0_vgpr1
     %1:_(<12 x s16>) = G_IMPLICIT_DEF
     G_STORE %1, %0 :: (store (<12 x s16>), align 16, addrspace 1)

>From 91ad3175cf0961b6f75f09c271ef6b27cd2e611f Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 12 Dec 2023 17:06:05 +0000
Subject: [PATCH 2/5] [AArch64][GlobalISel] Pre-commit for Combine
 vecreduce(ext) to {U/S}ADDLV

---
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 6554 ++++++++------------
 1 file changed, 2484 insertions(+), 4070 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 5fa28f77dc285f..32f5bfc43d6e54 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-SD-BASE
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-SD-DOT
-; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-GI-BASE
-; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-GI-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
 
 ; CHECK-GI-BASE:        warning: Instruction selection used fallback path for full
 
@@ -51,33 +51,19 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i32_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -85,33 +71,19 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i32_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -145,33 +117,19 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w0, s0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -179,33 +137,19 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w0, s0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -239,84 +183,47 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i16:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i16:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i16:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    uxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i16:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    uxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
 define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -324,53 +231,29 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -378,37 +261,21 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -416,37 +283,21 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -454,41 +305,23 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
-; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -522,14 +355,14 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -543,6 +376,15 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -561,14 +403,14 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -582,6 +424,15 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -669,39 +520,22 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w0, s0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -724,37 +558,21 @@ entry:
 }
 
 define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    uxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    uxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -762,37 +580,21 @@ entry:
 }
 
 define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlp v0.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -800,35 +602,20 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    uxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    uxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -836,35 +623,20 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -872,120 +644,65 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i8:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i8:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    uxtb w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i8:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    uxtb w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxtb w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
 define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -993,89 +710,47 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1083,57 +758,31 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1141,57 +790,31 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1199,47 +822,26 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1247,57 +849,31 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    addp d0, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    addp d0, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1305,41 +881,23 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
-; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x0, d0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x0, d0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1386,37 +944,21 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1425,37 +967,21 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1494,37 +1020,21 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w0, w8, w0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w0, w8, w0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w0, w8, w0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1533,37 +1043,21 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w0, w8, w0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w0, w8, w0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w0, w8, w0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1602,37 +1096,21 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i16_acc:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i16_acc:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i16_acc:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i16_acc:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i16_acc:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
@@ -1640,57 +1118,31 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1699,57 +1151,31 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1758,41 +1184,23 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1801,41 +1209,23 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1844,45 +1234,25 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1920,15 +1290,15 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-SD-BASE-NEXT:    add w0, w8, w0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -1943,6 +1313,16 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1963,19 +1343,19 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-SD-BASE-NEXT:    add w0, w8, w0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
 ; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
 ; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
@@ -1986,6 +1366,16 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -2084,43 +1474,24 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w0, w8, w0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w0, w8, w0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w0, w8, w0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -2146,41 +1517,23 @@ entry:
 }
 
 define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlv h0, v0.16b
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlv h0, v0.16b
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlv h0, v0.16b
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -2189,41 +1542,23 @@ entry:
 }
 
 define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlv h0, v0.16b
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    sxth w0, w8
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlv h0, v0.16b
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    sxth w0, w8
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlv h0, v0.16b
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    sxth w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -2232,41 +1567,23 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -2275,41 +1592,23 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    sxth w0, w8
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    sxth w0, w8
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    sxth w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -2318,37 +1617,21 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
-; CHECK-SD-BASE-NEXT:    fmov w8, s0
-; CHECK-SD-BASE-NEXT:    add w8, w8, w0
-; CHECK-SD-BASE-NEXT:    and w0, w8, #0xff
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w8, w8, w0
-; CHECK-SD-DOT-NEXT:    and w0, w8, #0xff
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxtb
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxtb
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w8, w8, w0
+; CHECK-SD-NEXT:    and w0, w8, #0xff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -2356,93 +1639,49 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -2451,93 +1690,49 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -2546,61 +1741,33 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2609,61 +1776,33 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2672,51 +1811,28 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2725,61 +1841,33 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    addp d0, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    addp d0, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2788,45 +1876,25 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
-; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x8, d0
-; CHECK-SD-BASE-NEXT:    add x0, x8, x0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x8, d0
-; CHECK-SD-DOT-NEXT:    add x0, x8, x0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    add x0, x8, x0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    add x0, x8, x0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x8, d0
+; CHECK-SD-NEXT:    add x0, x8, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2865,37 +1933,21 @@ entry:
 }
 
 define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i32:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i32:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i32:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i32:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i32_v4i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
@@ -2904,47 +1956,26 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2955,47 +1986,26 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-SD-BASE-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-SD-DOT-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -3006,41 +2016,23 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -3051,41 +2043,23 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -3096,47 +2070,26 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-SD-DOT-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-SD-DOT-NEXT:    addv s0, v1.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-NEXT:    addv s0, v1.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3147,47 +2100,26 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-SD-DOT-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-SD-DOT-NEXT:    addv s0, v1.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-DOT-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-SD-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-SD-NEXT:    addv s0, v1.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3198,41 +2130,23 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -3243,68 +2157,68 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = sext <4 x i16> %x to <4 x i32>
+  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
+  %yy = sext <4 x i16> %y to <4 x i32>
+  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
+  %z = add i32 %z1, %z2
+  ret i32 %z
+}
+
+define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-BASE-LABEL: test_udot_v8i8:
 ; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v2.4s
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-SD-DOT-LABEL: test_udot_v8i8:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
 ;
-; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI-BASE-LABEL: test_udot_v8i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
 ;
-; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI-DOT-LABEL: test_udot_v8i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
-entry:
-  %xx = sext <4 x i16> %x to <4 x i32>
-  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
-  %yy = sext <4 x i16> %y to <4 x i32>
-  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
-  %z = add i32 %z1, %z2
-  ret i32 %z
-}
-
-define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-BASE-LABEL: test_udot_v8i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
-; CHECK-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v2.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: test_udot_v8i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
 entry:
   %0 = zext <8 x i8> %a to <8 x i32>
   %1 = zext <8 x i8> %b to <8 x i32>
@@ -3329,13 +2243,13 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: test_udot_v16i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-DOT-LABEL: test_udot_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v16i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -3351,6 +2265,14 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_udot_v16i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %0 = zext <16 x i8> %a to <16 x i32>
   %1 = zext <16 x i8> %b to <16 x i32>
@@ -3360,28 +2282,28 @@ entry:
 }
 
 define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
-; CHECK-BASE-LABEL: test_udot_v24i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ldr q0, [x0]
-; CHECK-BASE-NEXT:    ldr q1, [x1]
-; CHECK-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
-; CHECK-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
-; CHECK-BASE-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-BASE-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
-; CHECK-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
-; CHECK-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
+; CHECK-SD-BASE-LABEL: test_udot_v24i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ldr q0, [x0]
+; CHECK-SD-BASE-NEXT:    ldr q1, [x1]
+; CHECK-SD-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-SD-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-SD-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    ushll v3.8h, v4.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v4.8h, v5.8b, #0
+; CHECK-SD-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
 ;
 ; CHECK-SD-DOT-LABEL: test_udot_v24i8:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
@@ -3400,6 +2322,29 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    add w0, w9, w8
 ; CHECK-SD-DOT-NEXT:    ret
 ;
+; CHECK-GI-BASE-LABEL: test_udot_v24i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
+; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
@@ -3515,42 +2460,42 @@ entry:
 }
 
 define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
-; CHECK-BASE-LABEL: test_udot_v48i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
-; CHECK-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
-; CHECK-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
-; CHECK-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
-; CHECK-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    ushll v1.8h, v2.8b, #0
-; CHECK-BASE-NEXT:    ushll v2.8h, v7.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
-; CHECK-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
-; CHECK-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-BASE-NEXT:    ushll v1.8h, v3.8b, #0
-; CHECK-BASE-NEXT:    ushll v2.8h, v4.8b, #0
-; CHECK-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
-; CHECK-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
-; CHECK-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
+; CHECK-SD-BASE-LABEL: test_udot_v48i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-SD-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-SD-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-SD-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-SD-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
+; CHECK-SD-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
+; CHECK-SD-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v2.8h, v7.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
+; CHECK-SD-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
+; CHECK-SD-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
+; CHECK-SD-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v3.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v2.8h, v4.8b, #0
+; CHECK-SD-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
+; CHECK-SD-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
+; CHECK-SD-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
 ;
 ; CHECK-SD-DOT-LABEL: test_udot_v48i8:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
@@ -3566,6 +2511,43 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
 ;
+; CHECK-GI-BASE-LABEL: test_udot_v48i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v7.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
+; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
+; CHECK-GI-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
+; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
@@ -3780,23 +2762,41 @@ entry:
 }
 
 define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-BASE-LABEL: test_sdot_v8i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h
-; CHECK-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v2.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: test_sdot_v8i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-BASE-LABEL: test_sdot_v8i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v2.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: test_sdot_v8i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_sdot_v8i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %0 = sext <8 x i8> %a to <8 x i32>
   %1 = sext <8 x i8> %b to <8 x i32>
@@ -3821,13 +2821,13 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: test_sdot_v16i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
+; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -3843,6 +2843,14 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_sdot_v16i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %0 = sext <16 x i8> %a to <16 x i32>
   %1 = sext <16 x i8> %b to <16 x i32>
@@ -3852,28 +2860,28 @@ entry:
 }
 
 define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
-; CHECK-BASE-LABEL: test_sdot_v24i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ldr q0, [x0]
-; CHECK-BASE-NEXT:    ldr q1, [x1]
-; CHECK-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
-; CHECK-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
-; CHECK-BASE-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-BASE-NEXT:    sshll v4.8h, v5.8b, #0
-; CHECK-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
-; CHECK-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
-; CHECK-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
-; CHECK-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
+; CHECK-SD-BASE-LABEL: test_sdot_v24i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ldr q0, [x0]
+; CHECK-SD-BASE-NEXT:    ldr q1, [x1]
+; CHECK-SD-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-SD-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-SD-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    sshll v3.8h, v4.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v4.8h, v5.8b, #0
+; CHECK-SD-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
 ;
 ; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
@@ -3892,6 +2900,29 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    add w0, w9, w8
 ; CHECK-SD-DOT-NEXT:    ret
 ;
+; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
+; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
@@ -4007,46 +3038,46 @@ entry:
 }
 
 define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
-; CHECK-BASE-LABEL: test_sdot_v48i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
-; CHECK-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
-; CHECK-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
-; CHECK-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
-; CHECK-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    sshll v1.8h, v2.8b, #0
-; CHECK-BASE-NEXT:    sshll v2.8h, v7.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
-; CHECK-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
-; CHECK-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
-; CHECK-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-BASE-NEXT:    sshll v1.8h, v3.8b, #0
-; CHECK-BASE-NEXT:    sshll v2.8h, v4.8b, #0
-; CHECK-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
-; CHECK-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
-; CHECK-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-BASE-LABEL: test_sdot_v48i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-SD-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-SD-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-SD-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-SD-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
+; CHECK-SD-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
+; CHECK-SD-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v2.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v2.8h, v7.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
+; CHECK-SD-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
+; CHECK-SD-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
+; CHECK-SD-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v3.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v2.8h, v4.8b, #0
+; CHECK-SD-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
+; CHECK-SD-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
+; CHECK-SD-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32]
 ; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32]
 ; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
@@ -4058,6 +3089,43 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
 ;
+; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-GI-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v7.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
+; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
+; CHECK-GI-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
+; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
@@ -4273,18 +3341,18 @@ entry:
 
 ; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT
 define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-BASE-LABEL: test_udot_v8i8_multi_use:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
-; CHECK-BASE-NEXT:    mov v3.16b, v2.16b
-; CHECK-BASE-NEXT:    fmov w8, s2
-; CHECK-BASE-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v3.4s
-; CHECK-BASE-NEXT:    fmov w9, s0
-; CHECK-BASE-NEXT:    add w0, w9, w8
-; CHECK-BASE-NEXT:    ret
+; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    mov v3.16b, v2.16b
+; CHECK-SD-BASE-NEXT:    fmov w8, s2
+; CHECK-SD-BASE-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v3.4s
+; CHECK-SD-BASE-NEXT:    fmov w9, s0
+; CHECK-SD-BASE-NEXT:    add w0, w9, w8
+; CHECK-SD-BASE-NEXT:    ret
 ;
 ; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
@@ -4299,18 +3367,18 @@ define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-DOT-NEXT:    add w0, w8, w9
 ; CHECK-SD-DOT-NEXT:    ret
 ;
-; CHECK-GI-DOT-LABEL: test_udot_v8i8_multi_use:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    umull v2.4s, v1.4h, v0.4h
-; CHECK-GI-DOT-NEXT:    mov v3.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    fmov w8, s2
-; CHECK-GI-DOT-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
-; CHECK-GI-DOT-NEXT:    addv s0, v3.4s
-; CHECK-GI-DOT-NEXT:    fmov w9, s0
-; CHECK-GI-DOT-NEXT:    add w0, w9, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-GI-LABEL: test_udot_v8i8_multi_use:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-GI-NEXT:    mov v3.16b, v2.16b
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
+; CHECK-GI-NEXT:    addv s0, v3.4s
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add w0, w9, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = zext <8 x i8> %a to <8 x i32>
   %1 = zext <8 x i8> %b to <8 x i32>
@@ -4322,39 +3390,22 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i16:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i16:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i16:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i16:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v8i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
@@ -4363,85 +3414,45 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -4452,85 +3463,45 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -4541,55 +3512,30 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -4600,55 +3546,30 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-SD-BASE-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-SD-DOT-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -4659,53 +3580,29 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x0000000000ffff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x0000000000ffff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0x0000000000ffff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -4716,59 +3613,32 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-SD-BASE-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-SD-BASE-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-SD-DOT-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-SD-DOT-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-GI-BASE-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #48
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-GI-DOT-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #48
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -5041,51 +3911,28 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -5096,59 +3943,32 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-SD-BASE-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-SD-BASE-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-SD-BASE-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-SD-DOT-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-SD-DOT-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-SD-DOT-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-GI-BASE-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-GI-BASE-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-GI-BASE-NEXT:    sshr v1.4s, v1.4s, #24
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-GI-DOT-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-GI-DOT-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-GI-DOT-NEXT:    sshr v1.4s, v1.4s, #24
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w0, w8, w9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -5159,49 +3979,27 @@ entry:
 }
 
 define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-SD-BASE-NEXT:    uadalp v1.8h, v0.16b
-; CHECK-SD-BASE-NEXT:    addv h0, v1.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-SD-DOT-NEXT:    uadalp v1.8h, v0.16b
-; CHECK-SD-DOT-NEXT:    addv h0, v1.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v1.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -5212,49 +4010,27 @@ entry:
 }
 
 define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddlp v1.8h, v1.16b
-; CHECK-SD-BASE-NEXT:    sadalp v1.8h, v0.16b
-; CHECK-SD-BASE-NEXT:    addv h0, v1.8h
-; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddlp v1.8h, v1.16b
-; CHECK-SD-DOT-NEXT:    sadalp v1.8h, v0.16b
-; CHECK-SD-DOT-NEXT:    addv h0, v1.8h
-; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-BASE-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-DOT-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddlp v1.8h, v1.16b
+; CHECK-SD-NEXT:    sadalp v1.8h, v0.16b
+; CHECK-SD-NEXT:    addv h0, v1.8h
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -5265,43 +4041,24 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -5312,43 +4069,24 @@ entry:
 }
 
 define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
-; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
-; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
-; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-BASE-NEXT:    sxth w0, w8
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
-; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-DOT-NEXT:    sxth w0, w8
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -5359,39 +4097,22 @@ entry:
 }
 
 define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i8:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i8:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
-; CHECK-GI-BASE-NEXT:    addv b1, v1.16b
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxtb
-; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i8:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
-; CHECK-GI-DOT-NEXT:    addv b1, v1.16b
-; CHECK-GI-DOT-NEXT:    fmov w8, s0
-; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxtb
-; CHECK-GI-DOT-NEXT:    and w0, w8, #0xff
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v16i8_v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    addv b0, v0.16b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    addv b1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
@@ -5400,157 +4121,81 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-SD-BASE-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-SD-DOT-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v16.2d, v4.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v17.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v18.2d, v5.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v19.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v20.2d, v6.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v21.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v22.2d, v7.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v23.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v16.2d, v4.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v17.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v18.2d, v5.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v19.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v20.2d, v6.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v21.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v22.2d, v7.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v23.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-SD-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -5561,157 +4206,81 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-SD-BASE-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-SD-DOT-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v16.2d, v4.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v17.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v18.2d, v5.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v19.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v20.2d, v6.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v21.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v22.2d, v7.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v23.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-DOT-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v16.2d, v4.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v17.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v18.2d, v5.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v19.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v20.2d, v6.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v21.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v22.2d, v7.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v23.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-SD-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -5722,93 +4291,49 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -5819,93 +4344,49 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-DOT-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -5916,73 +4397,39 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-BASE-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-BASE-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-DOT-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-DOT-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-NEXT:    addp d0, v1.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -5993,93 +4440,49 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-SD-BASE-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-SD-BASE-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-SD-BASE-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-SD-DOT-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-SD-DOT-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-SD-DOT-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-GI-BASE-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    addp d0, v2.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v3.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-GI-DOT-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    addp d0, v2.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v3.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-SD-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-SD-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-SD-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-SD-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-SD-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-GI-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v2.2d
+; CHECK-GI-NEXT:    addp d1, v3.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -6090,53 +4493,29 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -6147,59 +4526,32 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-SD-BASE-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-SD-DOT-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -6305,69 +4657,37 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
-; CHECK-SD-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-SD-BASE-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-SD-BASE-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
-; CHECK-SD-BASE-NEXT:    fmov w0, s0
-; CHECK-SD-BASE-NEXT:    ret
-;
-; CHECK-SD-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-SD-DOT-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-SD-DOT-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-SD-DOT-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-SD-DOT-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
-;
-; CHECK-GI-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.4s, v5.4s, v1.4s
-; CHECK-GI-BASE-NEXT:    add v2.4s, v6.4s, v2.4s
-; CHECK-GI-BASE-NEXT:    add v3.4s, v7.4s, v3.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-BASE-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
-; CHECK-GI-BASE-NEXT:    ret
-;
-; CHECK-GI-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-DOT-NEXT:    ushll v7.4s, v3.4h, #0
-; CHECK-GI-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-GI-DOT-NEXT:    add v1.4s, v5.4s, v1.4s
-; CHECK-GI-DOT-NEXT:    add v2.4s, v6.4s, v2.4s
-; CHECK-GI-DOT-NEXT:    add v3.4s, v7.4s, v3.4s
-; CHECK-GI-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-DOT-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    fmov w0, s0
-; CHECK-GI-DOT-NEXT:    ret
+; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-SD-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %axx = zext <8 x i16> %ax to <8 x i32>
   %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -6393,154 +4713,248 @@ entry:
 }
 
 define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
-; CHECK-SD-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-SD-LABEL: add_pair_v2i64_v2i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    addp d0, v0.2d
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
+entry:
+  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
+  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
+  %z = add i64 %z1, %z2
+  ret i64 %z
+}
+
+define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
+; CHECK-SD-BASE-LABEL: full:
 ; CHECK-SD-BASE:       // %bb.0: // %entry
-; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
-; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ldr d0, [x2]
+; CHECK-SD-BASE-NEXT:    ldr d1, [x0]
+; CHECK-SD-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-SD-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-BASE-NEXT:    sxtw x8, w3
+; CHECK-SD-BASE-NEXT:    sxtw x9, w1
+; CHECK-SD-BASE-NEXT:    uabdl v0.8h, v1.8b, v0.8b
+; CHECK-SD-BASE-NEXT:    add x11, x2, x8
+; CHECK-SD-BASE-NEXT:    add x10, x0, x9
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    add x11, x11, x8
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    add x10, x10, x9
+; CHECK-SD-BASE-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    add x11, x11, x8
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    add x10, x10, x9
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    add x11, x11, x8
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    add x10, x10, x9
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    add x11, x11, x8
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    add x10, x10, x9
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    add x11, x11, x8
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    add x10, x10, x9
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    ldr d2, [x11, x8]
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    ldr d1, [x10, x9]
+; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-SD-DOT-LABEL: full:
 ; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
-; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ldr d0, [x0]
+; CHECK-SD-DOT-NEXT:    ldr d1, [x2]
+; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-DOT-NEXT:    sxtw x8, w3
+; CHECK-SD-DOT-NEXT:    sxtw x9, w1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    add x11, x2, x8
+; CHECK-SD-DOT-NEXT:    add x10, x0, x9
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8]
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
 ;
-; CHECK-GI-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI-BASE-LABEL: full:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
-; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
-; CHECK-GI-BASE-NEXT:    fmov x8, d0
-; CHECK-GI-BASE-NEXT:    fmov x9, d1
-; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ldr d0, [x2]
+; CHECK-GI-BASE-NEXT:    ldr d1, [x0]
+; CHECK-GI-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-GI-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-BASE-NEXT:    sxtw x8, w3
+; CHECK-GI-BASE-NEXT:    sxtw x9, w1
+; CHECK-GI-BASE-NEXT:    uabdl v0.8h, v1.8b, v0.8b
+; CHECK-GI-BASE-NEXT:    add x11, x2, x8
+; CHECK-GI-BASE-NEXT:    add x10, x0, x9
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    add x11, x11, x8
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    add x10, x10, x9
+; CHECK-GI-BASE-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    add x11, x11, x8
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    add x10, x10, x9
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    add x11, x11, x8
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    add x10, x10, x9
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    add x11, x11, x8
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    add x10, x10, x9
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    add x11, x11, x8
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    add x10, x10, x9
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11]
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10]
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    ldr d2, [x11, x8]
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    ldr d1, [x10, x9]
+; CHECK-GI-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-GI-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
 ;
-; CHECK-GI-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI-DOT-LABEL: full:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
-; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
-; CHECK-GI-DOT-NEXT:    fmov x8, d0
-; CHECK-GI-DOT-NEXT:    fmov x9, d1
-; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ldr d0, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d1, [x2]
+; CHECK-GI-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-GI-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-DOT-NEXT:    sxtw x8, w3
+; CHECK-GI-DOT-NEXT:    sxtw x9, w1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-GI-DOT-NEXT:    add x11, x2, x8
+; CHECK-GI-DOT-NEXT:    add x10, x0, x9
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    add x11, x11, x8
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    add x10, x10, x9
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    add x10, x10, x9
+; CHECK-GI-DOT-NEXT:    add x11, x11, x8
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    add x10, x10, x9
+; CHECK-GI-DOT-NEXT:    add x11, x11, x8
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    add x10, x10, x9
+; CHECK-GI-DOT-NEXT:    add x11, x11, x8
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    add x10, x10, x9
+; CHECK-GI-DOT-NEXT:    add x11, x11, x8
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11]
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    ldr d1, [x10, x9]
+; CHECK-GI-DOT-NEXT:    ldr d4, [x11, x8]
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
-entry:
-  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
-  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
-  %z = add i64 %z1, %z2
-  ret i64 %z
-}
-
-define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
-; CHECK-BASE-LABEL: full:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ldr d0, [x2]
-; CHECK-BASE-NEXT:    ldr d1, [x0]
-; CHECK-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-BASE-NEXT:    sxtw x8, w3
-; CHECK-BASE-NEXT:    sxtw x9, w1
-; CHECK-BASE-NEXT:    uabdl v0.8h, v1.8b, v0.8b
-; CHECK-BASE-NEXT:    add x11, x2, x8
-; CHECK-BASE-NEXT:    add x10, x0, x9
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x8
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    add x10, x10, x9
-; CHECK-BASE-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x8
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    add x10, x10, x9
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x8
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    add x10, x10, x9
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x8
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    add x10, x10, x9
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x8
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    add x10, x10, x9
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11]
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10]
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    ldr d2, [x11, x8]
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x10, x9]
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: full:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ldr d0, [x0]
-; CHECK-DOT-NEXT:    ldr d1, [x2]
-; CHECK-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-DOT-NEXT:    sxtw x8, w3
-; CHECK-DOT-NEXT:    sxtw x9, w1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v3.8b, #1
-; CHECK-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    add x11, x2, x8
-; CHECK-DOT-NEXT:    add x10, x0, x9
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10, x9]
-; CHECK-DOT-NEXT:    ldr d4, [x11, x8]
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
 entry:
   %idx.ext8 = sext i32 %s2 to i64
   %idx.ext = sext i32 %s1 to i64

>From 0ed325ddbc60a17f042d1fc07842851f502a04cc Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Thu, 2 Nov 2023 09:41:43 +0000
Subject: [PATCH 3/5] [AArch64][GlobalISel] Combine vecreduce(ext) to
 {U/S}ADDLV

Combines vecreduce_add(ext) to uaddlv instructions
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |   11 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |    1 +
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |   15 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   25 +-
 .../GISel/AArch64PreLegalizerCombiner.cpp     |  152 ++
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 2102 ++++++++++-------
 6 files changed, 1417 insertions(+), 889 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 99f256b887821b..1daa7d5fe6a7a8 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -44,13 +44,22 @@ def ext_addv_to_udot_addv : GICombineRule<
 >;
 }
 
+def ext_uaddv_to_uaddlv_matchinfo : GIDefMatchData<"std::pair<Register, bool>">;
+def ext_uaddv_to_uaddlv : GICombineRule<
+  (defs root:$root, ext_uaddv_to_uaddlv_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_VECREDUCE_ADD):$root,
+         [{ return matchExtUaddvToUaddlv(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyExtUaddvToUaddlv(*${root}, MRI, B, Observer, ${matchinfo}); }])
+>;
+
 def AArch64PreLegalizerCombiner: GICombiner<
   "AArch64PreLegalizerCombinerImpl", [all_combines,
                                       fconstant_to_constant,
                                       icmp_redundant_trunc,
                                       fold_global_offset,
                                       shuffle_to_extract,
-                                      ext_addv_to_udot_addv]> {
+                                      ext_addv_to_udot_addv,
+                                      ext_uaddv_to_uaddlv]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6ddbcd41dcb769..1fd639b4f7ee8f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -248,6 +248,7 @@ enum NodeType : unsigned {
 
   // Unsigned sum Long across Vector
   UADDLV,
+  SADDLV,
 
   // Add Pairwise of two vectors
   ADDP,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 1c88456560d3d3..e53328d6553af3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -227,6 +227,18 @@ def G_SMULL : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_UADDLV : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_SADDLV : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
 def G_UDOT : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
@@ -282,6 +294,9 @@ def : GINodeEquiv<G_BSP, AArch64bsp>;
 def : GINodeEquiv<G_UMULL, AArch64umull>;
 def : GINodeEquiv<G_SMULL, AArch64smull>;
 
+def : GINodeEquiv<G_SADDLV, AArch64saddlv>;
+def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
+
 def : GINodeEquiv<G_UDOT, AArch64udot>;
 def : GINodeEquiv<G_SDOT, AArch64sdot>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3f4875998fc004..0f0e0cd9231d0e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -799,6 +799,7 @@ def AArch64uminv    : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
 def AArch64smaxv    : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv    : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 def AArch64uaddlv   : SDNode<"AArch64ISD::UADDLV", SDT_AArch64uaddlp>;
+def AArch64saddlv   : SDNode<"AArch64ISD::SADDLV", SDT_AArch64uaddlp>;
 
 def AArch64uabd     : PatFrags<(ops node:$lhs, node:$rhs),
                                [(abdu node:$lhs, node:$rhs),
@@ -6680,17 +6681,25 @@ def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
 def : Pat<(v4i32 (AArch64uaddlv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$op), hsub))>;
 
-def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
-          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
+multiclass SIMDAcrossLaneLongReductionIntrinsic<string Opc, SDPatternOperator addlv> {
+  def : Pat<(v4i32 (addlv (v8i8 V64:$Rn))),
+            (v4i32 (SUBREG_TO_REG (i64 0), (!cast<Instruction>(Opc#"v8i8v") V64:$Rn), hsub))>;
 
-def : Pat<(v4i32 (AArch64uaddlv (v4i16 V64:$Rn))),
-          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv4i16v V64:$Rn), ssub))>;
+  def : Pat<(v4i32 (addlv (v4i16 V64:$Rn))),
+            (v4i32 (SUBREG_TO_REG (i64 0), (!cast<Instruction>(Opc#"v4i16v") V64:$Rn), ssub))>;
 
-def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
-          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
+  def : Pat<(v4i32 (addlv (v16i8 V128:$Rn))),
+            (v4i32 (SUBREG_TO_REG (i64 0), (!cast<Instruction>(Opc#"v16i8v") V128:$Rn), hsub))>;
 
-def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
-          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+  def : Pat<(v4i32 (addlv (v8i16 V128:$Rn))),
+            (v4i32 (SUBREG_TO_REG (i64 0), (!cast<Instruction>(Opc#"v8i16v") V128:$Rn), ssub))>;
+
+  def : Pat<(v2i64 (addlv (v4i32 V128:$Rn))),
+            (v2i64 (SUBREG_TO_REG (i64 0), (!cast<Instruction>(Opc#"v4i32v") V128:$Rn), dsub))>;
+}
+
+defm : SIMDAcrossLaneLongReductionIntrinsic<"UADDLV", AArch64uaddlv>;
+defm : SIMDAcrossLaneLongReductionIntrinsic<"SADDLV", AArch64saddlv>;
 
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index e05d2a1880c1df..c21d195a9f4e54 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -411,6 +411,158 @@ void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
+// Ensure that the type coming from the extend instruction is the right size
+bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
+                           std::pair<Register, bool> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
+         "Expected G_VECREDUCE_ADD Opcode");
+
+  // Check if the last instruction is an extend
+  MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  auto ExtOpc = ExtMI->getOpcode();
+
+  if (ExtOpc == TargetOpcode::G_ZEXT)
+    std::get<1>(MatchInfo) = 0;
+  else if (ExtOpc == TargetOpcode::G_SEXT)
+    std::get<1>(MatchInfo) = 1;
+  else
+    return false;
+
+  // Check if the source register is a valid type
+  Register ExtSrcReg = ExtMI->getOperand(1).getReg();
+  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  if ((DstTy.getScalarSizeInBits() == 16 &&
+       ExtSrcTy.getNumElements() % 8 == 0) ||
+      (DstTy.getScalarSizeInBits() == 32 &&
+       ExtSrcTy.getNumElements() % 4 == 0) ||
+      (DstTy.getScalarSizeInBits() == 64 &&
+       ExtSrcTy.getNumElements() % 4 == 0)) {
+    std::get<0>(MatchInfo) = ExtSrcReg;
+    return true;
+  }
+  return false;
+}
+
+void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
+                           MachineIRBuilder &B, GISelChangeObserver &Observer,
+                           std::pair<Register, bool> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
+         "Expected G_VECREDUCE_ADD Opcode");
+
+  unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
+  Register SrcReg = std::get<0>(MatchInfo);
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT SrcTy = MRI.getType(SrcReg);
+  LLT DstTy = MRI.getType(DstReg);
+
+  // If SrcTy has more elements than expected, split them into multiple
+  // instructions and sum the results
+  LLT MainTy;
+  SmallVector<Register, 1> WorkingRegisters;
+  unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
+  unsigned SrcNumElem = SrcTy.getNumElements();
+  if ((SrcScalSize == 8 && SrcNumElem > 16) ||
+      (SrcScalSize == 16 && SrcNumElem > 8) ||
+      (SrcScalSize == 32 && SrcNumElem > 4)) {
+
+    LLT LeftoverTy;
+    SmallVector<Register, 4> LeftoverRegs;
+    if (SrcScalSize == 8)
+      MainTy = LLT::fixed_vector(16, 8);
+    else if (SrcScalSize == 16)
+      MainTy = LLT::fixed_vector(8, 16);
+    else if (SrcScalSize == 32)
+      MainTy = LLT::fixed_vector(4, 32);
+    else
+      llvm_unreachable("Source's Scalar Size not supported");
+
+    // Extract the parts and put each extracted sources through U/SADDLV and put
+    // the values inside a small vec
+    extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
+                 LeftoverRegs, B, MRI);
+    for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
+      WorkingRegisters.push_back(LeftoverRegs[I]);
+    }
+  } else {
+    WorkingRegisters.push_back(SrcReg);
+    MainTy = SrcTy;
+  }
+
+  unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
+  LLT MidScalarLLT = LLT::scalar(MidScalarSize);
+  Register zeroReg =
+      B.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
+  for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
+    // If the number of elements is too small to build an instruction, extend
+    // its size before applying addlv
+    LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
+    if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
+        (WorkingRegTy.getNumElements() == 4)) {
+      WorkingRegisters[I] =
+          B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
+                                              : TargetOpcode::G_ZEXT,
+                       {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
+              ->getOperand(0)
+              .getReg();
+    }
+
+    // Generate the {U/S}ADDLV instruction, whose output is always double of the
+    // Src's Scalar size
+    LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
+                                      : LLT::fixed_vector(2, 64);
+    Register addlvReg = B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]})
+                            ->getOperand(0)
+                            .getReg();
+
+    // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
+    // v2i64 register.
+    //     i16, i32 results use v4i32 registers
+    //     i64      results use v2i64 registers
+    // Therefore we have to extract/truncate the value to the right type
+    if (MidScalarSize == 32 || MidScalarSize == 64) {
+      WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
+                                         {MidScalarLLT}, {addlvReg, zeroReg})
+                                ->getOperand(0)
+                                .getReg();
+    } else {
+      Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
+                                         {LLT::scalar(32)}, {addlvReg, zeroReg})
+                                ->getOperand(0)
+                                .getReg();
+      WorkingRegisters[I] =
+          B.buildTrunc({MidScalarLLT}, {extractReg})->getOperand(0).getReg();
+    }
+  }
+
+  Register outReg;
+  if (WorkingRegisters.size() > 1) {
+    outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
+                 ->getOperand(0)
+                 .getReg();
+    for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
+      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I])
+                   ->getOperand(0)
+                   .getReg();
+    }
+  } else {
+    outReg = WorkingRegisters[0];
+  }
+
+  if (DstTy.getScalarSizeInBits() > MidScalarSize) {
+    // Handle the scalar value if the DstTy's Scalar Size is more than double
+    // Src's ScalarType
+    B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
+                                        : TargetOpcode::G_ZEXT,
+                 {DstReg}, {outReg});
+  } else {
+    B.buildCopy(DstReg, outReg);
+  }
+
+  MI.eraseFromParent();
+}
+
 bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 32f5bfc43d6e54..0b43e3b695a396 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -51,19 +51,11 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
-; CHECK-SD-LABEL: add_v4i32_v4i64_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i32_v4i64_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv d0, v0.4s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -71,19 +63,11 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
-; CHECK-SD-LABEL: add_v4i32_v4i64_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i32_v4i64_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv d0, v0.4s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -117,19 +101,11 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
-; CHECK-SD-LABEL: add_v8i16_v8i32_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v8i16_v8i32_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -137,19 +113,11 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
-; CHECK-SD-LABEL: add_v8i16_v8i32_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddlv s0, v0.8h
-; CHECK-SD-NEXT:    fmov w0, s0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v8i16_v8i32_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -157,12 +125,18 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -170,12 +144,18 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -214,15 +194,8 @@ define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    uaddlv s0, v0.8h
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
@@ -244,15 +217,8 @@ define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    saddlv s0, v0.8h
+; CHECK-GI-NEXT:    smov x0, v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -270,11 +236,8 @@ define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
 ;
 ; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
@@ -292,11 +255,8 @@ define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
 ;
 ; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    smov x0, v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
@@ -366,15 +326,9 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
@@ -414,15 +368,9 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
@@ -458,11 +406,9 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
@@ -498,11 +444,9 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
@@ -530,11 +474,11 @@ define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -543,14 +487,23 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i32_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -567,11 +520,9 @@ define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
@@ -589,9 +540,7 @@ define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    sxth w0, w8
 ; CHECK-GI-NEXT:    ret
@@ -611,10 +560,9 @@ define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
@@ -632,8 +580,7 @@ define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    sxth w0, w8
 ; CHECK-GI-NEXT:    ret
@@ -683,25 +630,9 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and x0, x8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -731,25 +662,9 @@ define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth x0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -772,16 +687,9 @@ define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and x0, x8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -804,16 +712,9 @@ define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth x0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
@@ -832,15 +733,11 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    and x0, x8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -864,15 +761,11 @@ define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-NEXT:    addp d0, v1.2d
-; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth x0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
@@ -944,21 +837,12 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
-; CHECK-SD-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    add x0, x8, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv d0, v0.4s
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    add x0, x8, x0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -967,21 +851,12 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
-; CHECK-SD-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddlv d0, v0.4s
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    add x0, x8, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv d0, v0.4s
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    add x0, x8, x0
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1020,21 +895,12 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
-; CHECK-SD-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddlv s0, v0.8h
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w0, w8, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w0, w8, w0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1043,36 +909,34 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
-; CHECK-SD-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv s0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w0, w8, w0
+; CHECK-NEXT:    ret
+entry:
+  %xx = sext <8 x i16> %x to <8 x i32>
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
+  %r = add i32 %z, %a
+  ret i32 %r
+}
+
+define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
+; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w0
 ; CHECK-SD-NEXT:    ret
 ;
-; CHECK-GI-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w0
 ; CHECK-GI-NEXT:    ret
-entry:
-  %xx = sext <8 x i16> %x to <8 x i32>
-  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
-  %r = add i32 %z, %a
-  ret i32 %r
-}
-
-define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1081,13 +945,20 @@ entry:
 }
 
 define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1132,16 +1003,9 @@ define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    uaddlv s0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
@@ -1165,16 +1029,9 @@ define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    saddlv s0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -1194,12 +1051,9 @@ define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
@@ -1219,12 +1073,9 @@ define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
@@ -1302,16 +1153,9 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext:
@@ -1355,16 +1199,9 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext:
@@ -1404,12 +1241,9 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
@@ -1449,12 +1283,9 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
@@ -1485,12 +1316,11 @@ define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    add w0, w0, w8, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -1500,15 +1330,24 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
-; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w0, w8, sxth
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1517,23 +1356,13 @@ entry:
 }
 
 define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
-; CHECK-SD-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    uaddlv h0, v0.16b
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w8, w8, w0
-; CHECK-SD-NEXT:    and w0, w8, #0xffff
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w8, w8, w0
+; CHECK-NEXT:    and w0, w8, #0xffff
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1542,23 +1371,13 @@ entry:
 }
 
 define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
-; CHECK-SD-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    saddlv h0, v0.16b
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w8, w8, w0
-; CHECK-SD-NEXT:    sxth w0, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    saddlv h0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w8, w8, w0
+; CHECK-NEXT:    sxth w0, w8
+; CHECK-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1578,10 +1397,9 @@ define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w0
 ; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1603,10 +1421,9 @@ define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w0
 ; CHECK-GI-NEXT:    sxth w0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1617,21 +1434,37 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    addv b0, v0.16b
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    add w8, w8, w0
-; CHECK-SD-NEXT:    and w0, w8, #0xff
-; CHECK-SD-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv b0, v0.16b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
-; CHECK-GI-NEXT:    and w0, w8, #0xff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -1661,26 +1494,9 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -1712,26 +1528,9 @@ define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -1756,17 +1555,9 @@ define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -1791,17 +1582,9 @@ define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
@@ -1822,16 +1605,11 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -1857,16 +1635,11 @@ define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
 ;
 ; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-NEXT:    addp d0, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add x0, x0, w8, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
@@ -1966,12 +1739,8 @@ define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    uaddlv d0, v0.4s
+; CHECK-GI-NEXT:    uaddlv d1, v1.4s
 ; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    fmov x9, d1
 ; CHECK-GI-NEXT:    add x0, x8, x9
@@ -1996,12 +1765,8 @@ define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    saddlv d0, v0.4s
+; CHECK-GI-NEXT:    saddlv d1, v1.4s
 ; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    fmov x9, d1
 ; CHECK-GI-NEXT:    add x0, x8, x9
@@ -2080,12 +1845,8 @@ define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    uaddlv s0, v0.8h
+; CHECK-GI-NEXT:    uaddlv s1, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
@@ -2110,12 +1871,8 @@ define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    saddlv s0, v0.8h
+; CHECK-GI-NEXT:    saddlv s1, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
@@ -2139,10 +1896,8 @@ define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    uaddlv s1, v1.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
@@ -2166,10 +1921,8 @@ define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    saddlv s1, v1.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
@@ -3433,25 +3186,11 @@ define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    uaddlv s1, v1.8h
+; CHECK-GI-NEXT:    uaddlv s0, v0.8h
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add x0, x8, w9, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
@@ -3482,25 +3221,11 @@ define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    saddlv s1, v1.8h
+; CHECK-GI-NEXT:    saddlv s0, v0.8h
+; CHECK-GI-NEXT:    smov x8, v1.s[0]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add x0, x8, w9, sxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
@@ -3524,17 +3249,11 @@ define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    uaddlv s1, v1.4h
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add x0, x8, w9, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
@@ -3558,17 +3277,11 @@ define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    saddlv s1, v1.4h
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    smov x8, v1.s[0]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add x0, x8, w9, sxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
@@ -3678,25 +3391,12 @@ define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v5.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v3.4s, v6.4s, v3.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v7.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s1
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
@@ -3751,25 +3451,12 @@ define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v2.4s, v4.4s, v2.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v5.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v3.4s, v6.4s, v3.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v7.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-BASE-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s1
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    sxth w8, w8
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
@@ -3817,17 +3504,12 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s1
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
@@ -3875,17 +3557,12 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s1
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    sxth w8, w8
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
@@ -3922,16 +3599,15 @@ define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    uaddlv s1, v1.4h
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    add w0, w8, w9, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -3957,17 +3633,16 @@ define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s1, v1.4h
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    add w0, w8, w9, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
@@ -3989,15 +3664,11 @@ define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4020,15 +3691,11 @@ define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    saddlv h1, v1.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    sxth w0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4050,13 +3717,11 @@ define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4078,13 +3743,11 @@ define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-NEXT:    saddlv h1, v1.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    sxth w0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4156,45 +3819,12 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v16.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    ushll v19.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v21.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v22.2d, v7.2s, #0
-; CHECK-GI-NEXT:    ushll v23.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and x8, x8, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, w9, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -4241,45 +3871,12 @@ define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v16.2d, v4.2s, #0
-; CHECK-GI-NEXT:    sshll v17.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    sshll v19.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT:    sshll v21.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v22.2d, v7.2s, #0
-; CHECK-GI-NEXT:    sshll v23.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    saddlv h1, v1.16b
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    sxth x8, w8
+; CHECK-GI-NEXT:    add x0, x8, w9, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
@@ -4312,27 +3909,12 @@ define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and x8, x8, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, w9, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -4365,27 +3947,12 @@ define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-NEXT:    saddlv h0, v0.8b
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    sxth x8, w8
+; CHECK-GI-NEXT:    add x0, x8, w9, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
@@ -4411,24 +3978,15 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    uaddlv s1, v1.4h
+; CHECK-GI-NEXT:    uaddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and x8, x8, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, w9, uxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -4463,25 +4021,16 @@ define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-GI-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-GI-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-GI-NEXT:    addp d0, v2.2d
-; CHECK-GI-NEXT:    addp d1, v3.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    saddlv s1, v1.4h
+; CHECK-GI-NEXT:    saddlv s0, v0.4h
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    sxth x8, w8
+; CHECK-GI-NEXT:    add x0, x8, w9, sxth
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
@@ -4593,29 +4142,19 @@ define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8
 ;
 ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v6.4s, v2.4h, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.4s, v3.4h, #0
-; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v4.4s, v0.8h
-; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v5.4s, v1.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v2.4s, v6.4s, v2.8h
-; CHECK-GI-BASE-NEXT:    saddw2 v3.4s, v7.4s, v3.8h
-; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    saddlv h3, v3.8b
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
+; CHECK-GI-BASE-NEXT:    saddlv h2, v2.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s3
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s2
-; CHECK-GI-BASE-NEXT:    fmov w11, s3
-; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w10, w11
-; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s2
+; CHECK-GI-BASE-NEXT:    sxth w8, w8
+; CHECK-GI-BASE-NEXT:    and w9, w9, #0xffff
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10, uxth
+; CHECK-GI-BASE-NEXT:    add w8, w8, w11, sxth
+; CHECK-GI-BASE-NEXT:    add w0, w9, w8
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
@@ -4735,6 +4274,806 @@ entry:
   ret i64 %z
 }
 
+; Irregularly sized vectors
+define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
+; CHECK-SD-LABEL: add_v24i8_v24i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-NEXT:    add x8, sp, #72
+; CHECK-SD-NEXT:    ldr b2, [sp]
+; CHECK-SD-NEXT:    add x9, sp, #8
+; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #80
+; CHECK-SD-NEXT:    mov v0.b[1], w1
+; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #16
+; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #88
+; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #24
+; CHECK-SD-NEXT:    mov v0.b[2], w2
+; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #96
+; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #32
+; CHECK-SD-NEXT:    mov v0.b[3], w3
+; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #104
+; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #40
+; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #112
+; CHECK-SD-NEXT:    mov v0.b[4], w4
+; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #48
+; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #120
+; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-SD-NEXT:    mov v0.b[5], w5
+; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #56
+; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-SD-NEXT:    mov v0.b[6], w6
+; CHECK-SD-NEXT:    mov v0.b[7], w7
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uaddw v0.8h, v0.8h, v2.8b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w4
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w5
+; CHECK-GI-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-NEXT:    mov v4.s[2], w2
+; CHECK-GI-NEXT:    mov v5.s[2], w6
+; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-NEXT:    mov v4.s[3], w3
+; CHECK-GI-NEXT:    mov v5.s[3], w7
+; CHECK-GI-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = zext <24 x i8> %x to <24 x i16>
+  %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
+  ret i16 %z
+}
+
+define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
+; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v32i8_v32i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = zext <32 x i8> %x to <32 x i16>
+  %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
+  ret i16 %z
+}
+
+define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
+; CHECK-SD-LABEL: add_v24i8_v24i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-NEXT:    add x8, sp, #72
+; CHECK-SD-NEXT:    ldr b2, [sp]
+; CHECK-SD-NEXT:    add x9, sp, #8
+; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #80
+; CHECK-SD-NEXT:    mov v0.b[1], w1
+; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #16
+; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #88
+; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #24
+; CHECK-SD-NEXT:    mov v0.b[2], w2
+; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #96
+; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #32
+; CHECK-SD-NEXT:    mov v0.b[3], w3
+; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #104
+; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #40
+; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #112
+; CHECK-SD-NEXT:    mov v0.b[4], w4
+; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x9]
+; CHECK-SD-NEXT:    add x9, sp, #48
+; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #120
+; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-SD-NEXT:    mov v0.b[5], w5
+; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #56
+; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-SD-NEXT:    mov v0.b[6], w6
+; CHECK-SD-NEXT:    mov v0.b[7], w7
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    saddw v0.8h, v0.8h, v2.8b
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w4
+; CHECK-GI-NEXT:    ldr s0, [sp]
+; CHECK-GI-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w5
+; CHECK-GI-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-NEXT:    mov v4.s[2], w2
+; CHECK-GI-NEXT:    mov v5.s[2], w6
+; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-NEXT:    mov v4.s[3], w3
+; CHECK-GI-NEXT:    mov v5.s[3], w7
+; CHECK-GI-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = sext <24 x i8> %x to <24 x i16>
+  %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
+  ret i16 %z
+}
+
+define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
+; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    addv h0, v0.8h
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v32i8_v32i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    saddlv h1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = sext <32 x i8> %x to <32 x i16>
+  %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
+  ret i16 %z
+}
+
+; Irregularly sized vectors and larger extends
+define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
+; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    fmov s0, w0
+; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #72
+; CHECK-SD-BASE-NEXT:    ldr b2, [sp]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #8
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #80
+; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #16
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #88
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #24
+; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #96
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #32
+; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #104
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #40
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #112
+; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #48
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #120
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #56
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v3.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddw2 v2.4s, v3.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    fmov s0, w0
+; CHECK-SD-DOT-NEXT:    mov x8, sp
+; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #72
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #80
+; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1
+; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #88
+; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #96
+; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #104
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #112
+; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #120
+; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9]
+; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6
+; CHECK-SD-DOT-NEXT:    udot v4.2s, v1.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7
+; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #8
+; CHECK-SD-DOT-NEXT:    fmov w9, s1
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #16
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #24
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #32
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #40
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #48
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #56
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8]
+; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w9
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    fmov s4, w0
+; CHECK-GI-BASE-NEXT:    fmov s5, w4
+; CHECK-GI-BASE-NEXT:    ldr s0, [sp]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-BASE-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-BASE-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-BASE-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v4.s[1], w1
+; CHECK-GI-BASE-NEXT:    mov v5.s[1], w5
+; CHECK-GI-BASE-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v4.s[2], w2
+; CHECK-GI-BASE-NEXT:    mov v5.s[2], w6
+; CHECK-GI-BASE-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v4.s[3], w3
+; CHECK-GI-BASE-NEXT:    mov v5.s[3], w7
+; CHECK-GI-BASE-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-BASE-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    fmov s4, w0
+; CHECK-GI-DOT-NEXT:    fmov s5, w4
+; CHECK-GI-DOT-NEXT:    ldr s0, [sp]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-DOT-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-DOT-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-DOT-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-DOT-NEXT:    mov v4.s[1], w1
+; CHECK-GI-DOT-NEXT:    mov v5.s[1], w5
+; CHECK-GI-DOT-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-DOT-NEXT:    mov v4.s[2], w2
+; CHECK-GI-DOT-NEXT:    mov v5.s[2], w6
+; CHECK-GI-DOT-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-DOT-NEXT:    mov v4.s[3], w3
+; CHECK-GI-DOT-NEXT:    mov v5.s[3], w7
+; CHECK-GI-DOT-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-DOT-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
+; CHECK-GI-DOT-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
+entry:
+  %xx = zext <24 x i8> %x to <24 x i32>
+  %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
+  ret i32 %z
+}
+
+define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
+; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
+entry:
+  %xx = zext <32 x i8> %x to <32 x i32>
+  %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
+  ret i32 %z
+}
+
+define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
+; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    fmov s0, w0
+; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #72
+; CHECK-SD-BASE-NEXT:    ldr b2, [sp]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #8
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #80
+; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #16
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #88
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #24
+; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #96
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #32
+; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #104
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #40
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #112
+; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x9]
+; CHECK-SD-BASE-NEXT:    add x9, sp, #48
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #120
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5
+; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-SD-BASE-NEXT:    add x8, sp, #56
+; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v3.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v2.8b, #0
+; CHECK-SD-BASE-NEXT:    saddw2 v2.4s, v3.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    fmov s0, w0
+; CHECK-SD-DOT-NEXT:    mov x8, sp
+; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #72
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #80
+; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1
+; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #88
+; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #96
+; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #104
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #112
+; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-SD-DOT-NEXT:    add x9, sp, #120
+; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5
+; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9]
+; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6
+; CHECK-SD-DOT-NEXT:    sdot v4.2s, v1.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7
+; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #8
+; CHECK-SD-DOT-NEXT:    fmov w9, s1
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #16
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #24
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #32
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #40
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #48
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8]
+; CHECK-SD-DOT-NEXT:    add x8, sp, #56
+; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8]
+; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w9
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    fmov s4, w0
+; CHECK-GI-BASE-NEXT:    fmov s5, w4
+; CHECK-GI-BASE-NEXT:    ldr s0, [sp]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-BASE-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-BASE-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-BASE-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v4.s[1], w1
+; CHECK-GI-BASE-NEXT:    mov v5.s[1], w5
+; CHECK-GI-BASE-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v4.s[2], w2
+; CHECK-GI-BASE-NEXT:    mov v5.s[2], w6
+; CHECK-GI-BASE-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v4.s[3], w3
+; CHECK-GI-BASE-NEXT:    mov v5.s[3], w7
+; CHECK-GI-BASE-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-BASE-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-BASE-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-BASE-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-BASE-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-BASE-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    fmov s4, w0
+; CHECK-GI-DOT-NEXT:    fmov s5, w4
+; CHECK-GI-DOT-NEXT:    ldr s0, [sp]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #8]
+; CHECK-GI-DOT-NEXT:    ldr s1, [sp, #32]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #40]
+; CHECK-GI-DOT-NEXT:    ldr s2, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #72]
+; CHECK-GI-DOT-NEXT:    ldr s3, [sp, #96]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #104]
+; CHECK-GI-DOT-NEXT:    mov v4.s[1], w1
+; CHECK-GI-DOT-NEXT:    mov v5.s[1], w5
+; CHECK-GI-DOT-NEXT:    mov v0.s[1], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[1], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[1], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[1], v17.s[0]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #16]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #48]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #80]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #112]
+; CHECK-GI-DOT-NEXT:    mov v4.s[2], w2
+; CHECK-GI-DOT-NEXT:    mov v5.s[2], w6
+; CHECK-GI-DOT-NEXT:    mov v0.s[2], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[2], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[2], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[2], v17.s[0]
+; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #56]
+; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #88]
+; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #120]
+; CHECK-GI-DOT-NEXT:    mov v4.s[3], w3
+; CHECK-GI-DOT-NEXT:    mov v5.s[3], w7
+; CHECK-GI-DOT-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-GI-DOT-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-GI-DOT-NEXT:    mov v2.s[3], v16.s[0]
+; CHECK-GI-DOT-NEXT:    mov v3.s[3], v17.s[0]
+; CHECK-GI-DOT-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-DOT-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
+; CHECK-GI-DOT-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
+entry:
+  %xx = sext <24 x i8> %x to <24 x i32>
+  %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
+  ret i32 %z
+}
+
+define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
+; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
+entry:
+  %xx = sext <32 x i8> %x to <32 x i32>
+  %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
+  ret i32 %z
+}
+
 define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-SD-BASE-LABEL: full:
 ; CHECK-SD-BASE:       // %bb.0: // %entry
@@ -5107,6 +5446,8 @@ entry:
 }
 
 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>)
 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
@@ -5115,6 +5456,7 @@ declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
 declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)

>From c760d22f33f97afefbae1a06788ea37042321b4b Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Fri, 22 Dec 2023 13:24:23 +0000
Subject: [PATCH 4/5] fixup! [AArch64][GlobalISel] Combine vecreduce(ext) to
 {U/S}ADDLV

---
 .../AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index c21d195a9f4e54..69a0dd599088de 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -434,11 +434,13 @@ bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
   LLT ExtSrcTy = MRI.getType(ExtSrcReg);
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
   if ((DstTy.getScalarSizeInBits() == 16 &&
-       ExtSrcTy.getNumElements() % 8 == 0) ||
+       ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
       (DstTy.getScalarSizeInBits() == 32 &&
-       ExtSrcTy.getNumElements() % 4 == 0) ||
+       ExtSrcTy.getNumElements() % 4 == 0 &&
+       ExtSrcTy.getNumElements() < 65536) ||
       (DstTy.getScalarSizeInBits() == 64 &&
-       ExtSrcTy.getNumElements() % 4 == 0)) {
+       ExtSrcTy.getNumElements() % 4 == 0 &&
+       ExtSrcTy.getNumElements() < 4294967296)) {
     std::get<0>(MatchInfo) = ExtSrcReg;
     return true;
   }
@@ -539,12 +541,9 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
   Register outReg;
   if (WorkingRegisters.size() > 1) {
     outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
-                 ->getOperand(0)
-                 .getReg();
+                 .getReg(0);
     for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
-      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I])
-                   ->getOperand(0)
-                   .getReg();
+      outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
     }
   } else {
     outReg = WorkingRegisters[0];

>From 36a56d9a8c05fefcdf682638c23f0d7078a2b33d Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Thu, 11 Jan 2024 16:12:24 +0000
Subject: [PATCH 5/5] fixup! fixup! [AArch64][GlobalISel] Combine
 vecreduce(ext) to {U/S}ADDLV

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  2 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  1 +
 .../GISel/AArch64PreLegalizerCombiner.cpp     | 25 +++++++------------
 3 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d6f240441b29b2..91d2497fdb7e20 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4893,7 +4893,7 @@ LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
 
   SmallVector<Register> SplitSrcs;
   unsigned NumParts = SrcTy.getNumElements();
-  extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
+  extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
   Register Acc = ScalarReg;
   for (unsigned i = 0; i < NumParts; i++)
     Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e2d07a09649680..b030a52b82e006 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2461,6 +2461,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::SADDV)
     MAKE_CASE(AArch64ISD::UADDV)
     MAKE_CASE(AArch64ISD::UADDLV)
+    MAKE_CASE(AArch64ISD::SADDLV)
     MAKE_CASE(AArch64ISD::SDOT)
     MAKE_CASE(AArch64ISD::UDOT)
     MAKE_CASE(AArch64ISD::SMINV)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 69a0dd599088de..434c08259800a8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -436,11 +436,9 @@ bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
   if ((DstTy.getScalarSizeInBits() == 16 &&
        ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
       (DstTy.getScalarSizeInBits() == 32 &&
-       ExtSrcTy.getNumElements() % 4 == 0 &&
-       ExtSrcTy.getNumElements() < 65536) ||
+       ExtSrcTy.getNumElements() % 4 == 0) ||
       (DstTy.getScalarSizeInBits() == 64 &&
-       ExtSrcTy.getNumElements() % 4 == 0 &&
-       ExtSrcTy.getNumElements() < 4294967296)) {
+       ExtSrcTy.getNumElements() % 4 == 0)) {
     std::get<0>(MatchInfo) = ExtSrcReg;
     return true;
   }
@@ -494,8 +492,7 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
 
   unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
   LLT MidScalarLLT = LLT::scalar(MidScalarSize);
-  Register zeroReg =
-      B.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
+  Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
   for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
     // If the number of elements is too small to build an instruction, extend
     // its size before applying addlv
@@ -506,17 +503,15 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
           B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
                                               : TargetOpcode::G_ZEXT,
                        {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
-              ->getOperand(0)
-              .getReg();
+              .getReg(0);
     }
 
     // Generate the {U/S}ADDLV instruction, whose output is always double of the
     // Src's Scalar size
     LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
                                       : LLT::fixed_vector(2, 64);
-    Register addlvReg = B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]})
-                            ->getOperand(0)
-                            .getReg();
+    Register addlvReg =
+        B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);
 
     // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
     // v2i64 register.
@@ -526,15 +521,13 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
     if (MidScalarSize == 32 || MidScalarSize == 64) {
       WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                          {MidScalarLLT}, {addlvReg, zeroReg})
-                                ->getOperand(0)
-                                .getReg();
+                                .getReg(0);
     } else {
       Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
                                          {LLT::scalar(32)}, {addlvReg, zeroReg})
-                                ->getOperand(0)
-                                .getReg();
+                                .getReg(0);
       WorkingRegisters[I] =
-          B.buildTrunc({MidScalarLLT}, {extractReg})->getOperand(0).getReg();
+          B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
     }
   }
 



More information about the llvm-commits mailing list