[llvm] 140a094 - [AArch64][GlobalISel] More type support for G_VECREDUCE_ADD (#67433)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 28 03:47:31 PDT 2023


Author: chuongg3
Date: 2023-09-28T11:47:26+01:00
New Revision: 140a094f5fe6090c61934850ef372cc868e7eeaf

URL: https://github.com/llvm/llvm-project/commit/140a094f5fe6090c61934850ef372cc868e7eeaf
DIFF: https://github.com/llvm/llvm-project/commit/140a094f5fe6090c61934850ef372cc868e7eeaf.diff

LOG: [AArch64][GlobalISel] More type support for G_VECREDUCE_ADD (#67433)

G_VECREDUCE_ADD now accepts v4i16 and v8i8 source vectors: both types
are marked legal and are selected to the matching ADDV instructions.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/aarch64-addv.ll
    llvm/test/CodeGen/AArch64/vecreduce-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index b7e71da9fca4257..e0837b689607cc2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -3574,8 +3574,12 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
     unsigned Opc = 0;
     if (VecTy == LLT::fixed_vector(16, 8))
       Opc = AArch64::ADDVv16i8v;
+    else if (VecTy == LLT::fixed_vector(8, 8))
+      Opc = AArch64::ADDVv8i8v;
     else if (VecTy == LLT::fixed_vector(8, 16))
       Opc = AArch64::ADDVv8i16v;
+    else if (VecTy == LLT::fixed_vector(4, 16))
+      Opc = AArch64::ADDVv4i16v;
     else if (VecTy == LLT::fixed_vector(4, 32))
       Opc = AArch64::ADDVv4i32v;
     else if (VecTy == LLT::fixed_vector(2, 64))

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 8d3d94290b0e580..323b81f2175f3fb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -861,8 +861,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .lower();
 
   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
-      .legalFor(
-          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
+      .legalFor({{s8, v16s8},
+                 {s8, v8s8},
+                 {s16, v8s16},
+                 {s16, v4s16},
+                 {s32, v4s32},
+                 {s32, v2s32},
+                 {s64, v2s64}})
       .clampMaxNumElements(1, s64, 2)
       .clampMaxNumElements(1, s32, 4)
       .lower();

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 6cab309d7c094c2..f1798ccb1e3bbaa 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -144,12 +144,21 @@ define i32 @oversized_ADDV_512(ptr %arr)  {
 }
 
 define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) {
-; CHECK-LABEL: addv_combine_i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: addv_combine_i8:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add v0.8b, v0.8b, v1.8b
+; SDAG-NEXT:    addv b0, v0.8b
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: addv_combine_i8:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    addv b0, v0.8b
+; GISEL-NEXT:    addv b1, v1.8b
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    add w0, w9, w8, uxtb
+; GISEL-NEXT:    ret
 entry:
   %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1)
   %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2)
@@ -158,12 +167,21 @@ entry:
 }
 
 define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) {
-; CHECK-LABEL: addv_combine_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: addv_combine_i16:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add v0.4h, v0.4h, v1.4h
+; SDAG-NEXT:    addv h0, v0.4h
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: addv_combine_i16:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    addv h0, v0.4h
+; GISEL-NEXT:    addv h1, v1.4h
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    add w0, w9, w8, uxth
+; GISEL-NEXT:    ret
 entry:
   %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1)
   %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2)

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 039417784da0bba..f8209af8354bced 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1,6 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 -mattr=+dotprod %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:        warning: Instruction selection used fallback path for add_v16i8_v16i16_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_acc_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_acc_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_pair_v16i8_v16i16_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_pair_v16i8_v16i16_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for full
+
+define i32 @addv_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: addv_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
+  ret i32 %arg1
+}
+
+define i16 @addv_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: addv_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
+  ret i16 %arg1
+}
 
 define i32 @add_v4i32_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: add_v4i32_v4i32:
@@ -13,12 +44,37 @@ entry:
   ret i32 %z
 }
 
-define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
-; CHECK-LABEL: add_v4i32_v4i64_zext:
+define i8 @addv_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: addv_v8i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
+  ret i8 %arg1
+}
+
+define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
+; CHECK-BASE-LABEL: add_v4i32_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -26,11 +82,25 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
-; CHECK-LABEL: add_v4i32_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -64,11 +134,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -76,11 +160,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -114,27 +212,64 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i16:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i16:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
 define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -142,16 +277,40 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -159,12 +318,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -172,12 +347,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -185,14 +376,33 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
-; CHECK-LABEL: add_v2i16_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i16_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i16_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -234,6 +444,19 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-DOT-NEXT:    addv s0, v2.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -260,6 +483,19 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-DOT-NEXT:    addv s0, v2.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -282,6 +518,15 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -304,6 +549,15 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -311,13 +565,31 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -366,12 +638,27 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i16_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -379,12 +666,27 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i16_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -392,35 +694,90 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxtb w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
 define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -428,24 +785,66 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -453,17 +852,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -471,17 +896,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -489,13 +940,35 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -503,18 +976,44 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -522,17 +1021,36 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
-; CHECK-LABEL: add_v2i8_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
-entry:
-  %xx = zext <2 x i8> %x to <2 x i64>
-  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
+; CHECK-BASE-LABEL: add_v2i8_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i8_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
+entry:
+  %xx = zext <2 x i8> %x to <2 x i64>
+  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
   ret i64 %z
 }
 
@@ -576,12 +1094,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
-; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -590,12 +1124,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
-; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -634,12 +1184,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -648,12 +1214,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -692,13 +1274,29 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
-; CHECK-LABEL: add_v8i16_v8i16_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xffff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i16_acc:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i16_acc:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
@@ -706,17 +1304,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -725,17 +1349,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -744,13 +1394,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -759,13 +1427,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -774,15 +1460,36 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI53_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -829,6 +1536,20 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -858,6 +1579,20 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -883,6 +1618,16 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -908,6 +1653,16 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -916,14 +1671,34 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
-; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI59_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -979,14 +1754,32 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
-; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xffff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -995,14 +1788,32 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
-; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    sxth w0, w8
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    sxth w0, w8
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    sxth w0, w8
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1011,13 +1822,29 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-LABEL: add_v16i8_v16i8_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -1025,25 +1852,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1052,25 +1923,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1079,18 +1994,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1099,18 +2042,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1119,14 +2090,38 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI70_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI70_0]
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1135,19 +2130,47 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1156,15 +2179,36 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI72_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI72_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1203,12 +2247,28 @@ entry:
 }
 
 define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i32:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i32:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
@@ -1217,13 +2277,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1234,13 +2315,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1251,12 +2353,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1267,12 +2387,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1283,13 +2421,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-NEXT:    addv s0, v1.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v1.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    addv s0, v1.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1300,13 +2459,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-NEXT:    addv s0, v1.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v1.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    addv s0, v1.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1317,12 +2497,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1333,12 +2531,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1349,12 +2565,29 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i16:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i16:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
@@ -1363,22 +2596,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1389,22 +2662,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1415,15 +2728,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1434,15 +2772,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1453,15 +2816,40 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI89_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI89_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1472,17 +2860,44 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1519,6 +2934,29 @@ define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-DOT-NEXT:    addv s0, v3.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1555,6 +2993,29 @@ define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-DOT-NEXT:    addv s0, v3.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1584,6 +3045,21 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1613,6 +3089,21 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1623,14 +3114,38 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI95_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI95_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1641,17 +3156,44 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-BASE-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-BASE-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-BASE-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-DOT-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-DOT-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-DOT-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1696,12 +3238,31 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1712,12 +3273,31 @@ entry:
 }
 
 define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1728,12 +3308,29 @@ entry:
 }
 
 define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    addv b1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
@@ -1742,38 +3339,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-BASE-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-BASE-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-DOT-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-DOT-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1784,38 +3457,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-BASE-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-BASE-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-DOT-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-DOT-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1826,24 +3575,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1854,24 +3647,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1882,17 +3719,52 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI106_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI106_0]
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1903,26 +3775,70 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-BASE-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-BASE-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-BASE-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-BASE-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-DOT-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-DOT-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-DOT-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-DOT-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-GI-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v2.2d
+; CHECK-GI-NEXT:    addp d1, v3.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1933,15 +3849,40 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI108_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI108_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1952,17 +3893,44 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2001,6 +3969,33 @@ define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8
 ; CHECK-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v3.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v4.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v5.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v2.4s, v6.4s, v2.8h
+; CHECK-GI-NEXT:    saddw2 v3.4s, v7.4s, v3.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    addv s3, v3.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w11, s3
+; CHECK-GI-NEXT:    add w8, w8, w9
+; CHECK-GI-NEXT:    add w9, w10, w11
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %axx = zext <8 x i8> %ax to <8 x i32>
   %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
@@ -2017,16 +4012,48 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
-; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-DOT-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %axx = zext <8 x i16> %ax to <8 x i32>
   %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2052,12 +4079,28 @@ entry:
 }
 
 define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
-; CHECK-LABEL: add_pair_v2i64_v2i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
@@ -2175,6 +4218,62 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: full:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x2]
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    sxtw x8, w3
+; CHECK-GI-NEXT:    sxtw x9, w1
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    movi v3.8b, #1
+; CHECK-GI-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    add x11, x2, x8
+; CHECK-GI-NEXT:    add x10, x0, x9
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10, x9]
+; CHECK-GI-NEXT:    ldr d4, [x11, x8]
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %idx.ext8 = sext i32 %s2 to i64
   %idx.ext = sext i32 %s1 to i64
@@ -2261,7 +4360,9 @@ entry:
 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
@@ -2269,3 +4370,4 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)


        


More information about the llvm-commits mailing list