[llvm] [AArch64][GlobalISel] More type support for G_VECREDUCE_ADD (PR #67433)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 27 03:39:13 PDT 2023


https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/67433

>From 6c1f7c208e8c9db35f234a2dd855697b64acb089 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 26 Sep 2023 10:18:59 +0100
Subject: [PATCH 1/2] [AArch64][GlobalISel] More type support for
 G_VECREDUCE_ADD

G_VECREDUCE_ADD can now be legalized and selected with v4i16 and v8i8 source vectors.
---
 .../GISel/AArch64InstructionSelector.cpp      |  4 ++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  9 +++-
 llvm/test/CodeGen/AArch64/aarch64-addv.ll     | 42 +++++++++++++------
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 36 ++++++++++++++++
 4 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 60bb820585ac0a2..0bbdebb80590a10 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -3559,8 +3559,12 @@ bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
     unsigned Opc = 0;
     if (VecTy == LLT::fixed_vector(16, 8))
       Opc = AArch64::ADDVv16i8v;
+    else if (VecTy == LLT::fixed_vector(8, 8))
+      Opc = AArch64::ADDVv8i8v;
     else if (VecTy == LLT::fixed_vector(8, 16))
       Opc = AArch64::ADDVv8i16v;
+    else if (VecTy == LLT::fixed_vector(4, 16))
+      Opc = AArch64::ADDVv4i16v;
     else if (VecTy == LLT::fixed_vector(4, 32))
       Opc = AArch64::ADDVv4i32v;
     else if (VecTy == LLT::fixed_vector(2, 64))
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 8d3d94290b0e580..323b81f2175f3fb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -861,8 +861,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .lower();
 
   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
-      .legalFor(
-          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
+      .legalFor({{s8, v16s8},
+                 {s8, v8s8},
+                 {s16, v8s16},
+                 {s16, v4s16},
+                 {s32, v4s32},
+                 {s32, v2s32},
+                 {s64, v2s64}})
       .clampMaxNumElements(1, s64, 2)
       .clampMaxNumElements(1, s32, 4)
       .lower();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 6cab309d7c094c2..f1798ccb1e3bbaa 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -144,12 +144,21 @@ define i32 @oversized_ADDV_512(ptr %arr)  {
 }
 
 define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) {
-; CHECK-LABEL: addv_combine_i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: addv_combine_i8:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add v0.8b, v0.8b, v1.8b
+; SDAG-NEXT:    addv b0, v0.8b
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: addv_combine_i8:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    addv b0, v0.8b
+; GISEL-NEXT:    addv b1, v1.8b
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    add w0, w9, w8, uxtb
+; GISEL-NEXT:    ret
 entry:
   %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1)
   %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2)
@@ -158,12 +167,21 @@ entry:
 }
 
 define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) {
-; CHECK-LABEL: addv_combine_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; SDAG-LABEL: addv_combine_i16:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add v0.4h, v0.4h, v1.4h
+; SDAG-NEXT:    addv h0, v0.4h
+; SDAG-NEXT:    fmov w0, s0
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: addv_combine_i16:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    addv h0, v0.4h
+; GISEL-NEXT:    addv h1, v1.4h
+; GISEL-NEXT:    fmov w8, s0
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    add w0, w9, w8, uxth
+; GISEL-NEXT:    ret
 entry:
   %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1)
   %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 039417784da0bba..4d2ec0ba7107ed9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2,6 +2,28 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
 
+define i32 @addv_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: addv_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
+  ret i32 %arg1
+}
+
+define i16 @addv_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: addv_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
+  ret i16 %arg1
+}
+
 define i32 @add_v4i32_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: add_v4i32_v4i32:
 ; CHECK:       // %bb.0: // %entry
@@ -13,6 +35,17 @@ entry:
   ret i32 %z
 }
 
+define i8 @addv_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: addv_v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
+  ret i8 %arg1
+}
+
 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
 ; CHECK-LABEL: add_v4i32_v4i64_zext:
 ; CHECK:       // %bb.0: // %entry
@@ -2261,7 +2294,9 @@ entry:
 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
@@ -2269,3 +2304,4 @@ declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)

>From 5dec7077f04aa735504146c816e528232fd18ea9 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Tue, 26 Sep 2023 16:26:13 +0100
Subject: [PATCH 2/2] fixup! [AArch64][GlobalISel] More type support for
 G_VECREDUCE_ADD

---
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 3488 ++++++++++++++++----
 1 file changed, 2777 insertions(+), 711 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 4d2ec0ba7107ed9..f8209af8354bced 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1,6 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 -mattr=+dotprod %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:        warning: Instruction selection used fallback path for add_v16i8_v16i16_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_acc_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_v16i8_v16i16_acc_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_pair_v16i8_v16i16_zext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for add_pair_v16i8_v16i16_sext
+; CHECK-GI-NEXT:   warning: Instruction selection used fallback path for full
 
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
@@ -47,11 +56,25 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
-; CHECK-LABEL: add_v4i32_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -59,11 +82,25 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
-; CHECK-LABEL: add_v4i32_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -97,11 +134,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -109,11 +160,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -147,27 +212,64 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i16:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i16:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
 define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -175,16 +277,40 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
-; CHECK-LABEL: add_v8i16_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -192,12 +318,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -205,12 +347,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
-; CHECK-LABEL: add_v4i16_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -218,14 +376,33 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
-; CHECK-LABEL: add_v2i16_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i16_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i16_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -267,6 +444,19 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-DOT-NEXT:    addv s0, v2.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -293,6 +483,19 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-DOT-NEXT:    addv s0, v2.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -315,6 +518,15 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -337,6 +549,15 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -344,13 +565,31 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -399,12 +638,27 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i16_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -412,12 +666,27 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i16_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -425,35 +694,90 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uxtb w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
 define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -461,24 +785,66 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
-; CHECK-LABEL: add_v16i8_v16i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -486,17 +852,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -504,17 +896,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
-; CHECK-LABEL: add_v8i8_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -522,13 +940,35 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -536,18 +976,44 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
-; CHECK-LABEL: add_v4i8_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -555,14 +1021,33 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
-; CHECK-LABEL: add_v2i8_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i8_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i8_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -609,12 +1094,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
-; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -623,12 +1124,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
-; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -667,12 +1184,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -681,12 +1214,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
-; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -725,13 +1274,29 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
-; CHECK-LABEL: add_v8i16_v8i16_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xffff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i16_acc:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i16_acc:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
@@ -739,17 +1304,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -758,17 +1349,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -777,13 +1394,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -792,13 +1427,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -807,15 +1460,36 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
-; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI53_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -862,6 +1536,20 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -891,6 +1579,20 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -916,6 +1618,16 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -941,6 +1653,16 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -949,14 +1671,34 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
-; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w0, w8, w0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w0, w8, w0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI59_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1012,14 +1754,32 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
-; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xffff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1028,14 +1788,32 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
-; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    sxth w0, w8
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    sxth w0, w8
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    sxth w0, w8
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1044,13 +1822,29 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-LABEL: add_v16i8_v16i8_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    add w8, w8, w0
-; CHECK-NEXT:    and w0, w8, #0xff
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w8, s0
+; CHECK-BASE-NEXT:    add w8, w8, w0
+; CHECK-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w8, w8, w0
+; CHECK-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -1058,25 +1852,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1085,25 +1923,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1112,18 +1994,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1132,18 +2042,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1152,14 +2090,38 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlv d0, v0.4s
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI70_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI70_0]
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1168,19 +2130,47 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-NEXT:    addp d0, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1189,15 +2179,36 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
-; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x8, d0
+; CHECK-BASE-NEXT:    add x0, x8, x0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x8, d0
+; CHECK-DOT-NEXT:    add x0, x8, x0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI72_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI72_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    add x0, x8, x0
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1236,12 +2247,28 @@ entry:
 }
 
 define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i32:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i32:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
@@ -1250,13 +2277,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1267,13 +2315,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1284,12 +2353,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1300,12 +2387,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1316,13 +2421,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-NEXT:    addv s0, v1.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v1.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    addv s0, v1.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1333,13 +2459,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-NEXT:    addv s0, v1.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v1.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    addv s0, v1.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1350,12 +2497,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1366,12 +2531,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1382,12 +2565,29 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i16:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i16:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
@@ -1396,22 +2596,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1422,22 +2662,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1448,15 +2728,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1467,15 +2772,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1486,15 +2816,40 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI89_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI89_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1505,17 +2860,44 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1552,6 +2934,29 @@ define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-DOT-NEXT:    addv s0, v3.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1588,6 +2993,29 @@ define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-DOT-NEXT:    addv s0, v3.4s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-NEXT:    saddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1617,6 +3045,21 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1646,6 +3089,21 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1656,14 +3114,38 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI95_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI95_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1674,17 +3156,44 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-BASE-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-BASE-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-BASE-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-DOT-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-DOT-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-DOT-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1729,12 +3238,31 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1745,12 +3273,31 @@ entry:
 }
 
 define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    addv h0, v0.8h
+; CHECK-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    addv h0, v0.8h
+; CHECK-GI-NEXT:    addv h1, v1.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-NEXT:    sxth w0, w8
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1761,12 +3308,29 @@ entry:
 }
 
 define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-BASE-NEXT:    addv b0, v0.16b
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv b0, v0.16b
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addv b0, v0.16b
+; CHECK-GI-NEXT:    addv b1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
+; CHECK-GI-NEXT:    and w0, w8, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
@@ -1775,38 +3339,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-BASE-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-BASE-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-BASE-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-DOT-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-DOT-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1817,38 +3457,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-BASE-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-BASE-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-BASE-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-DOT-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-DOT-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v16.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll v19.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v21.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v22.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll v23.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1859,24 +3575,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1887,24 +3647,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1915,17 +3719,52 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-NEXT:    addp d0, v1.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-BASE-NEXT:    addp d0, v1.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-DOT-NEXT:    addp d0, v1.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI106_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI106_0]
+; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1936,26 +3775,70 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-BASE-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-BASE-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-BASE-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-BASE-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-DOT-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-DOT-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-DOT-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-DOT-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-GI-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-GI-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v2.2d
+; CHECK-GI-NEXT:    addp d1, v3.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1966,15 +3849,40 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI108_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI108_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1985,17 +3893,44 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2034,6 +3969,33 @@ define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8
 ; CHECK-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v3.4h, #0
+; CHECK-GI-NEXT:    uaddw2 v0.4s, v4.4s, v0.8h
+; CHECK-GI-NEXT:    uaddw2 v1.4s, v5.4s, v1.8h
+; CHECK-GI-NEXT:    saddw2 v2.4s, v6.4s, v2.8h
+; CHECK-GI-NEXT:    saddw2 v3.4s, v7.4s, v3.8h
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    addv s2, v2.4s
+; CHECK-GI-NEXT:    addv s3, v3.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w10, s2
+; CHECK-GI-NEXT:    fmov w11, s3
+; CHECK-GI-NEXT:    add w8, w8, w9
+; CHECK-GI-NEXT:    add w9, w10, w11
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
 entry:
   %axx = zext <8 x i8> %ax to <8 x i32>
   %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
@@ -2050,16 +4012,48 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
-; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-DOT-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-DOT-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-DOT-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-DOT-NEXT:    addv s0, v0.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %axx = zext <8 x i16> %ax to <8 x i32>
   %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2085,12 +4079,28 @@ entry:
 }
 
 define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
-; CHECK-LABEL: add_pair_v2i64_v2i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-BASE-NEXT:    addp d0, v0.2d
+; CHECK-BASE-NEXT:    fmov x0, d0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-DOT-NEXT:    addp d0, v0.2d
+; CHECK-DOT-NEXT:    fmov x0, d0
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    addp d0, v0.2d
+; CHECK-GI-NEXT:    addp d1, v1.2d
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
 entry:
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
@@ -2208,6 +4218,62 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
+;
+; CHECK-GI-LABEL: full:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x2]
+; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    sxtw x8, w3
+; CHECK-GI-NEXT:    sxtw x9, w1
+; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT:    movi v3.8b, #1
+; CHECK-GI-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    add x11, x2, x8
+; CHECK-GI-NEXT:    add x10, x0, x9
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    add x10, x10, x9
+; CHECK-GI-NEXT:    add x11, x11, x8
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10]
+; CHECK-GI-NEXT:    ldr d4, [x11]
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    ldr d1, [x10, x9]
+; CHECK-GI-NEXT:    ldr d4, [x11, x8]
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-GI-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %idx.ext8 = sext i32 %s2 to i64
   %idx.ext = sext i32 %s1 to i64



More information about the llvm-commits mailing list