[llvm] [AArch64][GlobalISel] Support udot lowering for vecreduce add (PR #70784)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 10 01:43:24 PST 2023


https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/70784

>From 2ce451004fd70ab63f771a13131c83d208379b2a Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 1 Nov 2023 11:40:08 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Pre-Commit for UDOT lowering for
 G_VECREDUCE_ADD

---
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 6805 ++++++++++----------
 1 file changed, 3579 insertions(+), 3226 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index a88c930d09e9b17..7dd275242425963 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
-; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 -mattr=+dotprod %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-SD-BASE
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-GI-BASE
 
-; CHECK-GI:        warning: Instruction selection used fallback path for full
+; CHECK-GI-BASE:        warning: Instruction selection used fallback path for full
 
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
@@ -50,25 +50,25 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
-; CHECK-BASE-LABEL: add_v4i32_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i32_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -76,25 +76,25 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
-; CHECK-BASE-LABEL: add_v4i32_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i32_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -128,25 +128,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
-; CHECK-BASE-LABEL: add_v8i16_v8i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -154,25 +154,25 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
-; CHECK-BASE-LABEL: add_v8i16_v8i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlv s0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -206,64 +206,64 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
-; CHECK-BASE-LABEL: add_v8i16_v8i16:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i16:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i16:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i16:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i16:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    uxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
 }
 
 define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
-; CHECK-BASE-LABEL: add_v8i16_v8i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -271,40 +271,40 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
-; CHECK-BASE-LABEL: add_v8i16_v8i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -312,28 +312,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
-; CHECK-BASE-LABEL: add_v4i16_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i16_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -341,28 +341,28 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
-; CHECK-BASE-LABEL: add_v4i16_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i16_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -370,32 +370,32 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
-; CHECK-BASE-LABEL: add_v2i16_v2i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v2i16_v2i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -418,38 +418,38 @@ entry:
 }
 
 define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -457,38 +457,38 @@ entry:
 }
 
 define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -496,30 +496,30 @@ entry:
 }
 
 define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
+; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -527,30 +527,30 @@ entry:
 }
 
 define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
+; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -558,30 +558,30 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
-; CHECK-BASE-LABEL: add_v4i8_v4i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -604,28 +604,28 @@ entry:
 }
 
 define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i16_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i16_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlp v0.8h, v0.16b
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    uxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -633,28 +633,28 @@ entry:
 }
 
 define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i16_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i16_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlp v0.8h, v0.16b
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlp v0.8h, v0.16b
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -662,27 +662,27 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i16_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i16_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    uxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -690,27 +690,27 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i16_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i16_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -718,90 +718,90 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    addv b0, v0.16b
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    addv b0, v0.16b
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv b0, v0.16b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    uxtb w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    uxtb w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
 }
 
 define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -809,66 +809,66 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
-; CHECK-BASE-LABEL: add_v16i8_v16i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -876,43 +876,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -920,43 +920,43 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
-; CHECK-BASE-LABEL: add_v8i8_v8i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -964,34 +964,34 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
-; CHECK-BASE-LABEL: add_v4i8_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -999,44 +999,44 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
-; CHECK-BASE-LABEL: add_v4i8_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-NEXT:    addp d0, v1.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    addp d0, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1044,32 +1044,32 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
-; CHECK-BASE-LABEL: add_v2i8_v2i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v2i8_v2i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x0, d0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1116,28 +1116,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1146,28 +1146,28 @@ entry:
 }
 
 define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i32_v4i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1206,28 +1206,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlv s0, v0.8h
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1236,28 +1236,28 @@ entry:
 }
 
 define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlv s0, v0.8h
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i32_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1296,29 +1296,29 @@ entry:
 }
 
 define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
-; CHECK-BASE-LABEL: add_v8i16_v8i16_acc:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i16_acc:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i16_acc:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i16_acc:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i16_acc:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
@@ -1326,43 +1326,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1371,43 +1371,43 @@ entry:
 }
 
 define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1416,31 +1416,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1449,31 +1449,31 @@ entry:
 }
 
 define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1482,35 +1482,35 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1536,41 +1536,41 @@ entry:
 }
 
 define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1579,41 +1579,41 @@ entry:
 }
 
 define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.16b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv s0, v2.4s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
-; CHECK-GI-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
-; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v2.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v3.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -1622,33 +1622,33 @@ entry:
 }
 
 define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    uaddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
+; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1657,33 +1657,33 @@ entry:
 }
 
 define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i32_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    saddlv s0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i32_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
+; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1692,33 +1692,33 @@ entry:
 }
 
 define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
-; CHECK-BASE-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w0, w8, w0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w0, w8, w0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w8, w0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w0, w8, w0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w0, w8, w0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w0, w8, w0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1744,31 +1744,31 @@ entry:
 }
 
 define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlv h0, v0.16b
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlv h0, v0.16b
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlv h0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1777,31 +1777,31 @@ entry:
 }
 
 define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlv h0, v0.16b
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    sxth w0, w8
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlv h0, v0.16b
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    sxth w0, w8
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i16_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    sxth w0, w8
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlv h0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    sxth w0, w8
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1810,32 +1810,32 @@ entry:
 }
 
 define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    and w0, w8, #0xffff
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    and w0, w8, #0xffff
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1844,32 +1844,32 @@ entry:
 }
 
 define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    sxth w0, w8
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    sxth w0, w8
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxth
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    sxth w0, w8
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    sxth w0, w8
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1878,29 +1878,29 @@ entry:
 }
 
 define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i8_acc:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    addv b0, v0.16b
-; CHECK-BASE-NEXT:    fmov w8, s0
-; CHECK-BASE-NEXT:    add w8, w8, w0
-; CHECK-BASE-NEXT:    and w0, w8, #0xff
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i8_acc:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    addv b0, v0.16b
-; CHECK-DOT-NEXT:    fmov w8, s0
-; CHECK-DOT-NEXT:    add w8, w8, w0
-; CHECK-DOT-NEXT:    and w0, w8, #0xff
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv b0, v0.16b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
-; CHECK-GI-NEXT:    and w0, w8, #0xff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w8, s0
+; CHECK-SD-BASE-NEXT:    add w8, w8, w0
+; CHECK-SD-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w8, s0
+; CHECK-SD-DOT-NEXT:    add w8, w8, w0
+; CHECK-SD-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -1908,69 +1908,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
-; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1979,69 +1979,69 @@ entry:
 }
 
 define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
-; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
-; CHECK-GI-NEXT:    add v1.2d, v2.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v1.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v1.2d, v5.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -2050,46 +2050,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2098,46 +2098,46 @@ entry:
 }
 
 define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
-; CHECK-GI-NEXT:    add v0.2d, v1.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2146,37 +2146,37 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlv d0, v0.4s
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlv d0, v0.4s
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlv d0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2185,47 +2185,47 @@ entry:
 }
 
 define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    ssra v1.2d, v0.2d, #56
-; CHECK-GI-NEXT:    addp d0, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    addp d0, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2234,35 +2234,35 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
-; CHECK-BASE-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x8, d0
-; CHECK-BASE-NEXT:    add x0, x8, x0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x8, d0
-; CHECK-DOT-NEXT:    add x0, x8, x0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x8, d0
+; CHECK-SD-BASE-NEXT:    add x0, x8, x0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x8, d0
+; CHECK-SD-DOT-NEXT:    add x0, x8, x0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    add x0, x8, x0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2301,28 +2301,28 @@ entry:
 }
 
 define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i32_v4i32:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i32_v4i32:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i32:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i32:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i32:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
@@ -2331,34 +2331,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2369,34 +2369,34 @@ entry:
 }
 
 define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2407,30 +2407,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2441,30 +2441,30 @@ entry:
 }
 
 define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2475,34 +2475,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v1.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-DOT-NEXT:    addv s0, v1.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-DOT-NEXT:    addv s0, v1.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -2513,34 +2513,34 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v1.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-DOT-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-DOT-NEXT:    addv s0, v1.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-SD-DOT-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-SD-DOT-NEXT:    addv s0, v1.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -2551,30 +2551,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -2585,30 +2585,30 @@ entry:
 }
 
 define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i16_v4i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i16_v4i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -2618,30 +2618,437 @@ entry:
   ret i32 %z
 }
 
-define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v8i16:
+define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-BASE-LABEL: test_udot_v8i8:
 ; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-BASE-NEXT:    addv h0, v0.8h
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v2.4s
 ; CHECK-BASE-NEXT:    fmov w0, s0
 ; CHECK-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: add_pair_v8i16_v8i16:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    add v0.8h, v0.8h, v1.8h
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-DOT-LABEL: test_udot_v8i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %0 = zext <8 x i8> %a to <8 x i32>
+  %1 = zext <8 x i8> %b to <8 x i32>
+  %2 = mul nuw nsw <8 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-BASE-LABEL: test_udot_v16i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    umull v4.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    umlal v4.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v4.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_udot_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: test_udot_v16i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v4.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v5.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umlal2 v4.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    umlal2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+entry:
+  %0 = zext <16 x i8> %a to <16 x i32>
+  %1 = zext <16 x i8> %b to <16 x i32>
+  %2 = mul nuw nsw <16 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
+; CHECK-BASE-LABEL: test_udot_v24i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ldr q0, [x0]
+; CHECK-BASE-NEXT:    ldr q1, [x1]
+; CHECK-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
+; CHECK-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-BASE-NEXT:    ushll v3.8h, v4.8b, #0
+; CHECK-BASE-NEXT:    ushll v4.8h, v5.8b, #0
+; CHECK-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
+; CHECK-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
+; CHECK-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
+; CHECK-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
+; CHECK-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_udot_v24i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ldr q2, [x0]
+; CHECK-SD-DOT-NEXT:    ldr q3, [x1]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16]
+; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-SD-DOT-NEXT:    udot v1.2s, v5.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    udot v0.4s, v3.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s1
+; CHECK-SD-DOT-NEXT:    fmov w9, s0
+; CHECK-SD-DOT-NEXT:    add w0, w9, w8
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %a = load <24 x i8>, ptr %p1
+  %b = load <24 x i8>, ptr %p2
+  %0 = zext <24 x i8> %a to <24 x i32>
+  %1 = zext <24 x i8> %b to <24 x i32>
+  %2 = mul nuw nsw <24 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
+; CHECK-BASE-LABEL: test_udot_v48i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
+; CHECK-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
+; CHECK-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
+; CHECK-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
+; CHECK-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
+; CHECK-BASE-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-BASE-NEXT:    ushll v2.8h, v7.8b, #0
+; CHECK-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
+; CHECK-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
+; CHECK-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
+; CHECK-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
+; CHECK-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-BASE-NEXT:    ushll v1.8h, v3.8b, #0
+; CHECK-BASE-NEXT:    ushll v2.8h, v4.8b, #0
+; CHECK-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
+; CHECK-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
+; CHECK-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
+; CHECK-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_udot_v48i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32]
+; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1]
+; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #16]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #16]
+; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %a = load <48 x i8>, ptr %p1
+  %b = load <48 x i8>, ptr %p2
+  %0 = zext <48 x i8> %a to <48 x i32>
+  %1 = zext <48 x i8> %b to <48 x i32>
+  %2 = mul nuw nsw <48 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-BASE-LABEL: test_sdot_v8i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h
+; CHECK-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v2.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %0 = sext <8 x i8> %a to <8 x i32>
+  %1 = sext <8 x i8> %b to <8 x i32>
+  %2 = mul nuw nsw <8 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    smull v4.4s, v3.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-SD-BASE-NEXT:    smlal v4.4s, v1.4h, v0.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v4.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v4.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull v5.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smlal2 v4.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    smlal2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
+entry:
+  %0 = sext <16 x i8> %a to <16 x i32>
+  %1 = sext <16 x i8> %b to <16 x i32>
+  %2 = mul nuw nsw <16 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
+; CHECK-BASE-LABEL: test_sdot_v24i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ldr q0, [x0]
+; CHECK-BASE-NEXT:    ldr q1, [x1]
+; CHECK-BASE-NEXT:    ldr d4, [x0, #16]
+; CHECK-BASE-NEXT:    ldr d5, [x1, #16]
+; CHECK-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
+; CHECK-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-BASE-NEXT:    sshll v3.8h, v4.8b, #0
+; CHECK-BASE-NEXT:    sshll v4.8h, v5.8b, #0
+; CHECK-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
+; CHECK-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
+; CHECK-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
+; CHECK-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
+; CHECK-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ldr q2, [x0]
+; CHECK-SD-DOT-NEXT:    ldr q3, [x1]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16]
+; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-SD-DOT-NEXT:    sdot v1.2s, v5.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    sdot v0.4s, v3.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w8, s1
+; CHECK-SD-DOT-NEXT:    fmov w9, s0
+; CHECK-SD-DOT-NEXT:    add w0, w9, w8
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %a = load <24 x i8>, ptr %p1
+  %b = load <24 x i8>, ptr %p2
+  %0 = sext <24 x i8> %a to <24 x i32>
+  %1 = sext <24 x i8> %b to <24 x i32>
+  %2 = mul nuw nsw <24 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
+  ret i32 %3
+}
+
+define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
+; CHECK-BASE-LABEL: test_sdot_v48i8:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ldp q0, q4, [x1]
+; CHECK-BASE-NEXT:    ldr q2, [x0, #32]
+; CHECK-BASE-NEXT:    ldp q1, q3, [x0]
+; CHECK-BASE-NEXT:    ldr q7, [x1, #32]
+; CHECK-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
+; CHECK-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
+; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
+; CHECK-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
+; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
+; CHECK-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
+; CHECK-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
+; CHECK-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-BASE-NEXT:    sshll v1.8h, v2.8b, #0
+; CHECK-BASE-NEXT:    sshll v2.8h, v7.8b, #0
+; CHECK-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
+; CHECK-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
+; CHECK-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
+; CHECK-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
+; CHECK-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-BASE-NEXT:    sshll v1.8h, v3.8b, #0
+; CHECK-BASE-NEXT:    sshll v2.8h, v4.8b, #0
+; CHECK-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
+; CHECK-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
+; CHECK-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
+; CHECK-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
+; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
+; CHECK-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-BASE-NEXT:    addv s0, v0.4s
+; CHECK-BASE-NEXT:    fmov w0, s0
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32]
+; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1]
+; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #16]
+; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #16]
+; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %a = load <48 x i8>, ptr %p1
+  %b = load <48 x i8>, ptr %p2
+  %0 = sext <48 x i8> %a to <48 x i32>
+  %1 = sext <48 x i8> %b to <48 x i32>
+  %2 = mul nuw nsw <48 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
+  ret i32 %3
+}
+
+; Test to ensure that GlobalISel does not combine a G_MUL that has more than one use into UDOT
+define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-BASE-LABEL: test_udot_v8i8_multi_use:
+; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-BASE-NEXT:    mov v3.16b, v2.16b
+; CHECK-BASE-NEXT:    fmov w8, s2
+; CHECK-BASE-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
+; CHECK-BASE-NEXT:    addv s0, v3.4s
+; CHECK-BASE-NEXT:    fmov w9, s0
+; CHECK-BASE-NEXT:    add w0, w9, w8
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-SD-DOT-NEXT:    addp v2.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w9, s0
+; CHECK-SD-DOT-NEXT:    fmov w8, s2
+; CHECK-SD-DOT-NEXT:    add w0, w8, w9
+; CHECK-SD-DOT-NEXT:    ret
+entry:
+  %0 = zext <8 x i8> %a to <8 x i32>
+  %1 = zext <8 x i8> %b to <8 x i32>
+  %2 = mul nuw nsw <8 x i32> %1, %0
+  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
+  %4 = extractelement <8 x i32> %2, i32 0
+  %5 = add nuw nsw i32 %3, %4
+  ret i32 %5
+}
+
+define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i16:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i16:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i16:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
@@ -2650,62 +3057,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2716,62 +3123,62 @@ entry:
 }
 
 define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2782,40 +3189,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2826,40 +3233,40 @@ entry:
 }
 
 define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-BASE-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    saddlp v1.2d, v1.4s
-; CHECK-DOT-NEXT:    sadalp v1.2d, v0.4s
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-BASE-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    saddlp v1.2d, v1.4s
+; CHECK-SD-DOT-NEXT:    sadalp v1.2d, v0.4s
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2870,39 +3277,39 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x0000000000ffff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2913,44 +3320,44 @@ entry:
 }
 
 define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #48
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48
-; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-SD-BASE-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-SD-BASE-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-SD-DOT-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-SD-DOT-NEXT:    ssra v0.2d, v1.2d, #48
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-GI-BASE-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #48
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2961,55 +3368,55 @@ entry:
 }
 
 define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    uaddl2 v4.4s, v0.8h, v2.8h
-; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h
-; CHECK-BASE-NEXT:    uaddl2 v2.4s, v1.8h, v3.8h
-; CHECK-BASE-NEXT:    uaddl v1.4s, v1.4h, v3.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
-; CHECK-DOT-NEXT:    addv s0, v3.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v5.4s, v0.8h
-; CHECK-GI-NEXT:    uaddw2 v3.4s, v6.4s, v3.8h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v7.4s, v1.8h
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v0.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v1.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    uaddl v1.4s, v1.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -3020,55 +3427,55 @@ entry:
 }
 
 define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    saddl2 v4.4s, v0.8h, v2.8h
-; CHECK-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
-; CHECK-BASE-NEXT:    saddl2 v2.4s, v1.8h, v3.8h
-; CHECK-BASE-NEXT:    saddl v1.4s, v1.4h, v3.4h
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
-; CHECK-DOT-NEXT:    addv s0, v3.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    saddw2 v2.4s, v4.4s, v2.8h
-; CHECK-GI-NEXT:    saddw2 v0.4s, v5.4s, v0.8h
-; CHECK-GI-NEXT:    saddw2 v3.4s, v6.4s, v3.8h
-; CHECK-GI-NEXT:    saddw2 v1.4s, v7.4s, v1.8h
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v0.8h, v2.8h
+; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
+; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v1.8h, v3.8h
+; CHECK-SD-BASE-NEXT:    saddl v1.4s, v1.4h, v3.4h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
+; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
+; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v2.4s, v4.4s, v2.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v5.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v3.4s, v6.4s, v3.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v7.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -3079,40 +3486,40 @@ entry:
 }
 
 define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v1.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v3.8b, #1
-; CHECK-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3123,40 +3530,40 @@ entry:
 }
 
 define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    saddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    sadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    addv s0, v1.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v3.8b, #1
-; CHECK-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b
-; CHECK-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
-; CHECK-GI-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
+; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3167,37 +3574,37 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -3208,44 +3615,44 @@ entry:
 }
 
 define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-BASE-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-BASE-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-BASE-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-DOT-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-DOT-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-DOT-NEXT:    ssra v0.4s, v1.4s, #24
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-SD-BASE-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-SD-BASE-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-SD-BASE-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-SD-DOT-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-SD-DOT-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-SD-DOT-NEXT:    ssra v0.4s, v1.4s, #24
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-BASE-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-BASE-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-BASE-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -3256,35 +3663,35 @@ entry:
 }
 
 define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-BASE-NEXT:    uadalp v1.8h, v0.16b
-; CHECK-BASE-NEXT:    addv h0, v1.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlp v1.8h, v1.16b
-; CHECK-DOT-NEXT:    uadalp v1.8h, v0.16b
-; CHECK-DOT-NEXT:    addv h0, v1.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-BASE-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-SD-BASE-NEXT:    addv h0, v1.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlp v1.8h, v1.16b
+; CHECK-SD-DOT-NEXT:    uadalp v1.8h, v0.16b
+; CHECK-SD-DOT-NEXT:    addv h0, v1.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -3295,35 +3702,35 @@ entry:
 }
 
 define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddlp v1.8h, v1.16b
-; CHECK-BASE-NEXT:    sadalp v1.8h, v0.16b
-; CHECK-BASE-NEXT:    addv h0, v1.8h
-; CHECK-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddlp v1.8h, v1.16b
-; CHECK-DOT-NEXT:    sadalp v1.8h, v0.16b
-; CHECK-DOT-NEXT:    addv h0, v1.8h
-; CHECK-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
-; CHECK-GI-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddlp v1.8h, v1.16b
+; CHECK-SD-BASE-NEXT:    sadalp v1.8h, v0.16b
+; CHECK-SD-BASE-NEXT:    addv h0, v1.8h
+; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddlp v1.8h, v1.16b
+; CHECK-SD-DOT-NEXT:    sadalp v1.8h, v0.16b
+; CHECK-SD-DOT-NEXT:    addv h0, v1.8h
+; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-BASE-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -3334,31 +3741,31 @@ entry:
 }
 
 define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -3369,31 +3776,31 @@ entry:
 }
 
 define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    addv h0, v0.8h
-; CHECK-BASE-NEXT:    smov w0, v0.h[0]
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addv h0, v0.8h
-; CHECK-DOT-NEXT:    smov w0, v0.h[0]
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    addv h0, v0.8h
-; CHECK-GI-NEXT:    addv h1, v1.8h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxth
-; CHECK-GI-NEXT:    sxth w0, w8
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-BASE-NEXT:    addv h0, v0.8h
+; CHECK-SD-BASE-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    addv h0, v0.8h
+; CHECK-SD-DOT-NEXT:    smov w0, v0.h[0]
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    addv h0, v0.8h
+; CHECK-GI-BASE-NEXT:    addv h1, v1.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-BASE-NEXT:    sxth w0, w8
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -3404,29 +3811,29 @@ entry:
 }
 
 define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i8:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-BASE-NEXT:    addv b0, v0.16b
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i8:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    add v0.16b, v0.16b, v1.16b
-; CHECK-DOT-NEXT:    addv b0, v0.16b
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addv b0, v0.16b
-; CHECK-GI-NEXT:    addv b1, v1.16b
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
-; CHECK-GI-NEXT:    and w0, w8, #0xff
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i8:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-SD-BASE-NEXT:    addv b0, v0.16b
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i8:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-SD-DOT-NEXT:    addv b0, v0.16b
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i8:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addv b0, v0.16b
+; CHECK-GI-BASE-NEXT:    addv b1, v1.16b
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxtb
+; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
@@ -3435,114 +3842,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-BASE-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-BASE-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-BASE-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-BASE-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-BASE-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-BASE-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-DOT-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-DOT-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-DOT-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
-; CHECK-DOT-NEXT:    uaddl v2.2d, v5.2s, v2.2s
-; CHECK-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v4.2s
-; CHECK-DOT-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
-; CHECK-DOT-NEXT:    uaddl v6.2d, v7.2s, v6.2s
-; CHECK-DOT-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v16.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    ushll v19.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v21.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v22.2d, v7.2s, #0
-; CHECK-GI-NEXT:    ushll v23.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-BASE-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-DOT-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.2d, v4.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v18.2d, v5.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v19.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v21.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v22.2d, v7.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v23.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -3553,114 +3960,114 @@ entry:
 }
 
 define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-BASE-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-BASE-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-BASE-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-BASE-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-BASE-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-BASE-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-BASE-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v3.8h, v1.16b, #0
-; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-DOT-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-DOT-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-DOT-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
-; CHECK-DOT-NEXT:    saddl v2.2d, v5.2s, v2.2s
-; CHECK-DOT-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v4.2s
-; CHECK-DOT-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
-; CHECK-DOT-NEXT:    saddl v6.2d, v7.2s, v6.2s
-; CHECK-DOT-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v16.2d, v4.2s, #0
-; CHECK-GI-NEXT:    sshll v17.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    sshll v19.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT:    sshll v21.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v22.2d, v7.2s, #0
-; CHECK-GI-NEXT:    sshll v23.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
-; CHECK-GI-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
-; CHECK-GI-NEXT:    add v2.2d, v4.2d, v2.2d
-; CHECK-GI-NEXT:    add v0.2d, v5.2d, v0.2d
-; CHECK-GI-NEXT:    add v3.2d, v6.2d, v3.2d
-; CHECK-GI-NEXT:    add v1.2d, v7.2d, v1.2d
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-BASE-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-SD-DOT-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v3.2d, v5.2d, v16.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.2d, v4.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v17.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v18.2d, v5.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v19.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v20.2d, v6.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v21.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v22.2d, v7.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v23.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -3671,68 +4078,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -3743,68 +4150,68 @@ entry:
 }
 
 define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-BASE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-BASE-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-DOT-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-DOT-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
-; CHECK-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
-; CHECK-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
-; CHECK-GI-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
-; CHECK-GI-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
-; CHECK-GI-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
-; CHECK-GI-NEXT:    add v0.2d, v2.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-BASE-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-BASE-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-BASE-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-BASE-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-BASE-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-SD-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
+; CHECK-SD-DOT-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-SD-DOT-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-SD-DOT-NEXT:    add v1.2d, v1.2d, v2.2d
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-BASE-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-BASE-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -3815,51 +4222,51 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-BASE-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-BASE-NEXT:    addp d0, v1.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    uaddlp v1.2d, v1.4s
-; CHECK-DOT-NEXT:    uadalp v1.2d, v0.4s
-; CHECK-DOT-NEXT:    addp d0, v1.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    and v3.16b, v3.16b, v2.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v4.16b, v4.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    add v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-BASE-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-BASE-NEXT:    addp d0, v1.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    uaddlp v1.2d, v1.4s
+; CHECK-SD-DOT-NEXT:    uadalp v1.2d, v0.4s
+; CHECK-SD-DOT-NEXT:    addp d0, v1.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-BASE-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-BASE-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -3870,70 +4277,70 @@ entry:
 }
 
 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BASE-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-BASE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-BASE-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-BASE-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-BASE-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-BASE-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-DOT-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-DOT-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    ushll v3.2d, v1.2s, #0
-; CHECK-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-DOT-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-DOT-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-DOT-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-DOT-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    add v0.2d, v2.2d, v3.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #56
-; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #56
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #56
-; CHECK-GI-NEXT:    ssra v2.2d, v0.2d, #56
-; CHECK-GI-NEXT:    ssra v3.2d, v1.2d, #56
-; CHECK-GI-NEXT:    addp d0, v2.2d
-; CHECK-GI-NEXT:    addp d1, v3.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-BASE-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-BASE-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-BASE-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-SD-BASE-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-SD-BASE-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-SD-DOT-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-SD-DOT-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-SD-DOT-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    add v0.2d, v2.2d, v3.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.2d, v0.4s, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-GI-BASE-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    addp d0, v2.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v3.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -3944,39 +4351,39 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-BASE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-BASE-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-SD-DOT-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-DOT-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -3987,44 +4394,44 @@ entry:
 }
 
 define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BASE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BASE-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-BASE-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-DOT-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-DOT-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-DOT-NEXT:    ssra v0.2d, v1.2d, #56
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-SD-BASE-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-SD-DOT-NEXT:    ssra v0.2d, v1.2d, #56
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-BASE-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-GI-BASE-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -4035,61 +4442,61 @@ entry:
 }
 
 define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
-; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    saddlp v3.4s, v3.8h
-; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    sadalp v3.4s, v2.8h
-; CHECK-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v5.8b, #1
-; CHECK-DOT-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b
-; CHECK-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
-; CHECK-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b
-; CHECK-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
-; CHECK-DOT-NEXT:    add v0.2s, v6.2s, v4.2s
-; CHECK-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v3.4h, #0
-; CHECK-GI-NEXT:    uaddw2 v0.4s, v4.4s, v0.8h
-; CHECK-GI-NEXT:    uaddw2 v1.4s, v5.4s, v1.8h
-; CHECK-GI-NEXT:    saddw2 v2.4s, v6.4s, v2.8h
-; CHECK-GI-NEXT:    saddw2 v3.4s, v7.4s, v3.8h
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    addv s1, v1.4s
-; CHECK-GI-NEXT:    addv s2, v2.4s
-; CHECK-GI-NEXT:    addv s3, v3.4s
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    fmov w11, s3
-; CHECK-GI-NEXT:    add w8, w8, w9
-; CHECK-GI-NEXT:    add w9, w10, w11
-; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    saddlp v3.4s, v3.8h
+; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    sadalp v3.4s, v2.8h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
+; CHECK-SD-DOT-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
+; CHECK-SD-DOT-NEXT:    add v0.2s, v6.2s, v4.2s
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v6.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    uaddw2 v0.4s, v4.4s, v0.8h
+; CHECK-GI-BASE-NEXT:    uaddw2 v1.4s, v5.4s, v1.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v2.4s, v6.4s, v2.8h
+; CHECK-GI-BASE-NEXT:    saddw2 v3.4s, v7.4s, v3.8h
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    fmov w10, s2
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %axx = zext <8 x i8> %ax to <8 x i32>
   %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
@@ -4106,48 +4513,48 @@ entry:
 }
 
 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
-; CHECK-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-BASE-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-BASE-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-BASE-NEXT:    addv s0, v0.4s
-; CHECK-BASE-NEXT:    fmov w0, s0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-DOT-NEXT:    uaddlp v3.4s, v3.8h
-; CHECK-DOT-NEXT:    uadalp v1.4s, v0.8h
-; CHECK-DOT-NEXT:    uadalp v3.4s, v2.8h
-; CHECK-DOT-NEXT:    add v0.4s, v3.4s, v1.4s
-; CHECK-DOT-NEXT:    addv s0, v0.4s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
-; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
-; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v0.4s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-BASE-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-BASE-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
+; CHECK-SD-BASE-NEXT:    fmov w0, s0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-SD-DOT-NEXT:    uaddlp v3.4s, v3.8h
+; CHECK-SD-DOT-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-SD-DOT-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-SD-DOT-NEXT:    add v0.4s, v3.4s, v1.4s
+; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.4s, v3.4h, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-GI-BASE-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-BASE-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
+; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %axx = zext <8 x i16> %ax to <8 x i32>
   %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -4173,28 +4580,28 @@ entry:
 }
 
 define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
-; CHECK-BASE-LABEL: add_pair_v2i64_v2i64:
-; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-BASE-NEXT:    addp d0, v0.2d
-; CHECK-BASE-NEXT:    fmov x0, d0
-; CHECK-BASE-NEXT:    ret
-;
-; CHECK-DOT-LABEL: add_pair_v2i64_v2i64:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-DOT-NEXT:    addp d0, v0.2d
-; CHECK-DOT-NEXT:    fmov x0, d0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    addp d1, v1.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    add x0, x8, x9
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-SD-BASE:       // %bb.0: // %entry
+; CHECK-SD-BASE-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-BASE-NEXT:    addp d0, v0.2d
+; CHECK-SD-BASE-NEXT:    fmov x0, d0
+; CHECK-SD-BASE-NEXT:    ret
+;
+; CHECK-SD-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-DOT-NEXT:    addp d0, v0.2d
+; CHECK-SD-DOT-NEXT:    fmov x0, d0
+; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-BASE-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI-BASE:       // %bb.0: // %entry
+; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
+; CHECK-GI-BASE-NEXT:    addp d1, v1.2d
+; CHECK-GI-BASE-NEXT:    fmov x8, d0
+; CHECK-GI-BASE-NEXT:    fmov x9, d1
+; CHECK-GI-BASE-NEXT:    add x0, x8, x9
+; CHECK-GI-BASE-NEXT:    ret
 entry:
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
@@ -4257,117 +4664,61 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-BASE-NEXT:    fmov w0, s0
 ; CHECK-BASE-NEXT:    ret
 ;
-; CHECK-DOT-LABEL: full:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    ldr d0, [x0]
-; CHECK-DOT-NEXT:    ldr d1, [x2]
-; CHECK-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-DOT-NEXT:    sxtw x8, w3
-; CHECK-DOT-NEXT:    sxtw x9, w1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    movi v3.8b, #1
-; CHECK-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    add x11, x2, x8
-; CHECK-DOT-NEXT:    add x10, x0, x9
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
-; CHECK-DOT-NEXT:    add x11, x11, x8
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
-; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10, x9]
-; CHECK-DOT-NEXT:    ldr d4, [x11, x8]
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-DOT-NEXT:    fmov w0, s0
-; CHECK-DOT-NEXT:    ret
-;
-; CHECK-GI-LABEL: full:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    ldr d1, [x2]
-; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT:    sxtw x8, w3
-; CHECK-GI-NEXT:    sxtw x9, w1
-; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT:    movi v3.8b, #1
-; CHECK-GI-NEXT:    uabd v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    add x11, x2, x8
-; CHECK-GI-NEXT:    add x10, x0, x9
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    add x10, x10, x9
-; CHECK-GI-NEXT:    add x11, x11, x8
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10]
-; CHECK-GI-NEXT:    ldr d4, [x11]
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    ldr d1, [x10, x9]
-; CHECK-GI-NEXT:    ldr d4, [x11, x8]
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-GI-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-GI-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-GI-NEXT:    fmov w0, s0
-; CHECK-GI-NEXT:    ret
+; CHECK-SD-DOT-LABEL: full:
+; CHECK-SD-DOT:       // %bb.0: // %entry
+; CHECK-SD-DOT-NEXT:    ldr d0, [x0]
+; CHECK-SD-DOT-NEXT:    ldr d1, [x2]
+; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-DOT-NEXT:    sxtw x8, w3
+; CHECK-SD-DOT-NEXT:    sxtw x9, w1
+; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-SD-DOT-NEXT:    add x11, x2, x8
+; CHECK-SD-DOT-NEXT:    add x10, x0, x9
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    add x10, x10, x9
+; CHECK-SD-DOT-NEXT:    add x11, x11, x8
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9]
+; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8]
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-SD-DOT-NEXT:    fmov w0, s0
+; CHECK-SD-DOT-NEXT:    ret
 entry:
   %idx.ext8 = sext i32 %s2 to i64
   %idx.ext = sext i32 %s1 to i64
@@ -4459,6 +4810,8 @@ declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
+declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)

>From e2a32e3c4f3f247d1109ab95ccdc3e1731b7f308 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Mon, 30 Oct 2023 09:51:47 +0000
Subject: [PATCH 2/2] [AArch64][GlobalISel] Support udot lowering for vecreduce
 add

vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))

Applies to vectors with 8-bit elements whose element count is a multiple of 8.
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |   14 +-
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |   15 +
 .../GISel/AArch64PreLegalizerCombiner.cpp     |  193 ++
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 2062 ++++++++++++++++-
 4 files changed, 2165 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 017c4523c23a184..7ff0ada30acfe47 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -33,12 +33,24 @@ def fold_global_offset : GICombineRule<
   (apply [{ applyFoldGlobalOffset(*${root}, MRI, B, Observer, ${matchinfo});}])
 >;
 
+// Tuple: {ext1 source, ext2 source (0 if no mul), bool: 0 = G_ZEXT, 1 = G_SEXT}
+def ext_addv_to_udot_addv_matchinfo : GIDefMatchData<"std::tuple<Register, Register, bool>">;
+let Predicates = [HasDotProd] in {
+def ext_addv_to_udot_addv : GICombineRule<
+  (defs root:$root, ext_addv_to_udot_addv_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_VECREDUCE_ADD):$root,
+         [{ return matchExtAddvToUdotAddv(*${root}, MRI, STI, ${matchinfo}); }]),
+  (apply [{ applyExtAddvToUdotAddv(*${root}, MRI, B, Observer, STI, ${matchinfo}); }])
+>;
+}
+
 def AArch64PreLegalizerCombiner: GICombiner<
   "AArch64PreLegalizerCombinerImpl", [all_combines,
                                       fconstant_to_constant,
                                       icmp_redundant_trunc,
                                       fold_global_offset,
-                                      shuffle_to_extract]> {
+                                      shuffle_to_extract,
+                                      ext_addv_to_udot_addv]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 27338bd24393325..1711360779bf74c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -227,6 +227,18 @@ def G_SMULL : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_UDOT : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+  let hasSideEffects = 0;
+}
+
+def G_SDOT : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -270,6 +282,9 @@ def : GINodeEquiv<G_BSP, AArch64bsp>;
 def : GINodeEquiv<G_UMULL, AArch64umull>;
 def : GINodeEquiv<G_SMULL, AArch64smull>;
 
+def : GINodeEquiv<G_UDOT, AArch64udot>;
+def : GINodeEquiv<G_SDOT, AArch64sdot>;
+
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
 def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index d9678bea214dd53..d8d13cfa60b0241 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -228,6 +228,199 @@ void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
       B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
 }
 
+// Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
+// Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
+// Similar to performVecReduceAddCombine in SelectionDAG
+// On success MatchInfo is {x, y (0 when there is no mul), true for G_SEXT}
+bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+                            const AArch64Subtarget &STI,
+                            std::tuple<Register, Register, bool> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
+         "Expected a G_VECREDUCE_ADD instruction");
+  assert(STI.hasDotProd() && "Target should have Dot Product feature");
+
+  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  Register DstReg = MI.getOperand(0).getReg();
+  Register MidReg = I1->getOperand(0).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT MidTy = MRI.getType(MidReg);
+  if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
+    return false;
+
+  // Operand 1 is only read once the opcode is known to be G_ZEXT/G_SEXT;
+  // other defining instructions need not have a register in that slot.
+  auto IsExt = [](const MachineInstr *Ext) {
+    return Ext->getOpcode() == TargetOpcode::G_ZEXT ||
+           Ext->getOpcode() == TargetOpcode::G_SEXT;
+  };
+
+  MachineInstr *ExtMI1 = I1;
+  MachineInstr *ExtMI2 = nullptr;
+  if (I1->getOpcode() == TargetOpcode::G_MUL) {
+    // If result of this has more than 1 use, then there is no point in creating
+    // udot instruction
+    if (!MRI.hasOneNonDBGUse(MidReg))
+      return false;
+
+    ExtMI1 = getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
+    ExtMI2 = getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
+    if (ExtMI1->getOpcode() != ExtMI2->getOpcode())
+      return false;
+  }
+  if (!IsExt(ExtMI1))
+    return false;
+
+  Register Src1Reg = ExtMI1->getOperand(1).getReg();
+  Register Src2Reg = ExtMI2 ? ExtMI2->getOperand(1).getReg() : Register();
+  LLT SrcTy = MRI.getType(Src1Reg);
+  // Both multiplicands must be extended from the same narrow type
+  if (ExtMI2 && MRI.getType(Src2Reg) != SrcTy)
+    return false;
+
+  // The apply step only handles i8 elements in multiples of 8
+  if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
+    return false;
+
+  MatchInfo = std::make_tuple(Src1Reg, Src2Reg,
+                              ExtMI1->getOpcode() == TargetOpcode::G_SEXT);
+  return true;
+}
+
+void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
+                            MachineIRBuilder &Builder,
+                            GISelChangeObserver &Observer,
+                            const AArch64Subtarget &STI,
+                            std::tuple<Register, Register, bool> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
+         "Expected a G_VECREDUCE_ADD instruction");
+  assert(STI.hasDotProd() && "Target should have Dot Product feature");
+
+  // Pick SDOT for sign-extended sources (MatchInfo bool set), UDOT otherwise
+  unsigned DotOpcode =
+      std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
+  Register Ext1SrcReg = std::get<0>(MatchInfo);
+
+  // If there is one source register, create a vector of 1s as the second
+  // source register (the multiplier for the vecreduce_add(ext(x)) form)
+  Register Ext2SrcReg;
+  if (std::get<1>(MatchInfo) == 0)
+    Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
+                     ->getOperand(0)
+                     .getReg();
+  else
+    Ext2SrcReg = std::get<1>(MatchInfo);
+
+  // Find out how many DOT-sized pieces (v16i8, else v8i8) the source has
+  LLT SrcTy = MRI.getType(Ext1SrcReg);
+  LLT MidTy;
+  unsigned NumOfDotMI;
+  if (SrcTy.getNumElements() % 16 == 0) {
+    NumOfDotMI = SrcTy.getNumElements() / 16;
+    MidTy = LLT::fixed_vector(4, 32);
+  } else if (SrcTy.getNumElements() % 8 == 0) {
+    NumOfDotMI = SrcTy.getNumElements() / 8;
+    MidTy = LLT::fixed_vector(2, 32);
+  } else {
+    llvm_unreachable("Source type number of elements is not multiple of 8");
+  }
+
+  // Handle case where one DOT instruction is needed
+  if (NumOfDotMI == 1) {
+    auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
+    auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
+                                  {Zeroes, Ext1SrcReg, Ext2SrcReg});
+    Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
+  } else {
+    // Otherwise split into v16i8 pieces, padding a trailing v8i8 with 0s
+    SmallVector<Register, 4> Ext1UnmergeReg;
+    SmallVector<Register, 4> Ext2UnmergeReg;
+    if (SrcTy.getNumElements() % 16 != 0) {
+      // Unmerge source to v8i8, append a v8i8 of 0s and then merge to v16i8
+      SmallVector<Register, 4> PadUnmergeDstReg1;
+      SmallVector<Register, 4> PadUnmergeDstReg2;
+      unsigned NumOfVec = SrcTy.getNumElements() / 8;
+
+      // Unmerge the source to v8i8
+      MachineInstr *PadUnmerge1 =
+          Builder.buildUnmerge(LLT::fixed_vector(8, 8), Ext1SrcReg);
+      MachineInstr *PadUnmerge2 =
+          Builder.buildUnmerge(LLT::fixed_vector(8, 8), Ext2SrcReg);
+      for (unsigned i = 0; i < NumOfVec; i++) {
+        PadUnmergeDstReg1.push_back(PadUnmerge1->getOperand(i).getReg());
+        PadUnmergeDstReg2.push_back(PadUnmerge2->getOperand(i).getReg());
+      }
+
+      // Pad both operand lists with the same v8i8 constant of 0s
+      MachineInstr *v8Zeroes =
+          Builder.buildConstant(LLT::fixed_vector(8, 8), 0);
+      PadUnmergeDstReg1.push_back(v8Zeroes->getOperand(0).getReg());
+      PadUnmergeDstReg2.push_back(v8Zeroes->getOperand(0).getReg());
+
+      // Merge adjacent v8i8 pairs back to v16i8
+      NumOfVec = (NumOfVec + 1) / 2;
+      for (unsigned i = 0; i < NumOfVec; i++) {
+        Ext1UnmergeReg.push_back(
+            Builder
+                .buildMergeLikeInstr(
+                    LLT::fixed_vector(16, 8),
+                    {PadUnmergeDstReg1[i * 2], PadUnmergeDstReg1[(i * 2) + 1]})
+                ->getOperand(0)
+                .getReg());
+        Ext2UnmergeReg.push_back(
+            Builder
+                .buildMergeLikeInstr(
+                    LLT::fixed_vector(16, 8),
+                    {PadUnmergeDstReg2[i * 2], PadUnmergeDstReg2[(i * 2) + 1]})
+                ->getOperand(0)
+                .getReg());
+      }
+    } else {
+      // Unmerge the source vectors to v16i8
+      MachineInstr *Ext1Unmerge =
+          Builder.buildUnmerge(LLT::fixed_vector(16, 8), Ext1SrcReg);
+      MachineInstr *Ext2Unmerge =
+          Builder.buildUnmerge(LLT::fixed_vector(16, 8), Ext2SrcReg);
+      for (unsigned i = 0; i < SrcTy.getNumElements() / 16; i++) {
+        Ext1UnmergeReg.push_back(Ext1Unmerge->getOperand(i).getReg());
+        Ext2UnmergeReg.push_back(Ext2Unmerge->getOperand(i).getReg());
+      }
+    }
+
+    // Build one DOT per v16i8 piece, each with a zeroed accumulator operand
+    SmallVector<Register, 2> DotReg;
+    unsigned NumElements = 0;
+    for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
+      LLT ZeroesLLT;
+      // Check if it is 16 or 8 elements. Size the zero vector accordingly
+      if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
+        ZeroesLLT = LLT::fixed_vector(4, 32);
+        NumElements += 4;
+      } else {
+        ZeroesLLT = LLT::fixed_vector(2, 32);
+        NumElements += 2;
+      }
+      auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
+      DotReg.push_back(
+          Builder
+              .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
+                          {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
+              ->getOperand(0)
+              .getReg());
+    }
+
+    // Concatenate the DOT results into a single i32 vector
+    auto ConcatMI =
+        Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);
+
+    // Put it through a vector reduction
+    Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
+                              ConcatMI->getOperand(0).getReg());
+  }
+
+  // Erase the original G_VECREDUCE_ADD now that it has been replaced
+  MI.eraseFromParent();
+}
+
 bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 7dd275242425963..f788cd71f19465b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-SD-BASE
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-DOT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-SD-DOT
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-GI-BASE
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-GI-DOT
 
 ; CHECK-GI-BASE:        warning: Instruction selection used fallback path for full
 
@@ -69,6 +70,14 @@ define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -95,6 +104,14 @@ define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -147,6 +164,14 @@ define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -173,6 +198,14 @@ define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -224,6 +257,13 @@ define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    uxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i16:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    uxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   ret i16 %z
@@ -264,6 +304,19 @@ define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -305,6 +358,19 @@ define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -334,6 +400,15 @@ define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -363,6 +438,15 @@ define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -396,6 +480,15 @@ define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -429,14 +522,14 @@ define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.16b, #1
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -468,14 +561,14 @@ define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.16b, #1
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -520,6 +613,15 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -551,6 +653,15 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -582,6 +693,15 @@ define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -626,6 +746,15 @@ define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    uxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    uxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -655,6 +784,15 @@ define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -683,6 +821,14 @@ define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    uxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    uxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -711,6 +857,14 @@ define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -736,6 +890,13 @@ define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    uxtb w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    uxtb w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   ret i8 %z
@@ -802,6 +963,29 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -869,6 +1053,29 @@ define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -913,6 +1120,20 @@ define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -957,6 +1178,20 @@ define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -992,6 +1227,19 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1037,6 +1285,19 @@ define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v1.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    addp d0, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1070,6 +1331,15 @@ define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
 ; CHECK-GI-BASE-NEXT:    addp d0, v0.2d
 ; CHECK-GI-BASE-NEXT:    fmov x0, d0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x0, d0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1138,6 +1408,15 @@ define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1168,6 +1447,15 @@ define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1228,6 +1516,15 @@ define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1258,6 +1555,15 @@ define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v1.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1319,6 +1625,14 @@ define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i16_acc:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %r = add i16 %z, %a
@@ -1363,6 +1677,20 @@ define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1408,6 +1736,20 @@ define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1441,6 +1783,16 @@ define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1474,6 +1826,16 @@ define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -1511,6 +1873,16 @@ define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000000000ffff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1548,15 +1920,15 @@ define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
 ; CHECK-SD-BASE-NEXT:    add w0, w8, w0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w0, w8, w0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.16b, #1
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -1591,15 +1963,15 @@ define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
 ; CHECK-SD-BASE-NEXT:    add w0, w8, w0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w8, s0
-; CHECK-SD-DOT-NEXT:    add w0, w8, w0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.16b, #1
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w8, s0
+; CHECK-DOT-NEXT:    add w0, w8, w0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -1649,6 +2021,16 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1684,6 +2066,16 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -1719,6 +2111,16 @@ define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w0, w8, w0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -1769,6 +2171,16 @@ define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1802,6 +2214,16 @@ define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v1.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -1836,6 +2258,15 @@ define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1870,6 +2301,15 @@ define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxth
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxth
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -1901,6 +2341,14 @@ define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
 ; CHECK-GI-BASE-NEXT:    add w8, w0, w8, uxtb
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    add w8, w0, w8, uxtb
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %r = add i8 %z, %a
@@ -1971,6 +2419,30 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -2042,6 +2514,30 @@ define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v7.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v5.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v7.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v1.2d, v2.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -2090,6 +2586,21 @@ define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2138,6 +2649,21 @@ define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v2.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v3.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -2177,6 +2703,20 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    and v2.16b, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2226,6 +2766,20 @@ define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    ssra v1.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    addp d0, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2263,6 +2817,16 @@ define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
 ; CHECK-GI-BASE-NEXT:    fmov x8, d0
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    add x0, x8, x0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2323,6 +2887,15 @@ define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i32:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
   %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
@@ -2359,6 +2932,19 @@ define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2397,6 +2983,19 @@ define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i32> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -2431,6 +3030,17 @@ define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2465,6 +3075,17 @@ define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <2 x i32> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2503,6 +3124,19 @@ define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -2541,6 +3175,19 @@ define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.4s, v2.4s, v0.8h
+; CHECK-GI-DOT-NEXT:    saddw2 v1.4s, v3.4s, v1.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -2575,6 +3222,17 @@ define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -2609,6 +3267,17 @@ define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -2629,13 +3298,13 @@ define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-BASE-NEXT:    fmov w0, s0
 ; CHECK-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: test_udot_v8i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
-; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: test_udot_v8i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 entry:
   %0 = zext <8 x i8> %a to <8 x i32>
   %1 = zext <8 x i8> %b to <8 x i32>
@@ -2660,13 +3329,13 @@ define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: test_udot_v16i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: test_udot_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v16i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -2730,6 +3399,111 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    fmov w9, s0
 ; CHECK-SD-DOT-NEXT:    add w0, w9, w8
 ; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_udot_v24i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #8]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
+; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #9]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v6.b[1], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #2]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #10]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #2]
+; CHECK-GI-DOT-NEXT:    mov v5.b[1], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #10]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #18]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #18]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #3]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #11]
+; CHECK-GI-DOT-NEXT:    mov v5.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[3], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #4]
+; CHECK-GI-DOT-NEXT:    mov v5.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #12]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #12]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[4], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #5]
+; CHECK-GI-DOT-NEXT:    mov v5.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #13]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[5], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #6]
+; CHECK-GI-DOT-NEXT:    mov v5.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #14]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #14]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #22]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[6], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #7]
+; CHECK-GI-DOT-NEXT:    mov v5.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #15]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[7], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v19.b[0]
+; CHECK-GI-DOT-NEXT:    mov v5.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v6.d[0]
+; CHECK-GI-DOT-NEXT:    mov v1.d[1], v3.d[0]
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v2.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v5.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    udot v3.4s, v4.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <24 x i8>, ptr %p1
   %b = load <24 x i8>, ptr %p2
@@ -2793,6 +3567,210 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_udot_v48i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
+; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
+; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
+; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
+; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
+; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
+; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
+; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
+; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
+; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
+; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
+; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
+; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
+; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
+; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
+; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
+; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
+; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
+; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
+; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
+; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
+; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
+; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
+; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
+; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
+; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
+; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
+; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
+; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
+; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
+; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
+; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
+; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
+; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
+; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
+; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
+; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    udot v7.4s, v5.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    fmov w10, s2
+; CHECK-GI-DOT-NEXT:    add w8, w8, w9
+; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
   %b = load <48 x i8>, ptr %p2
@@ -2814,13 +3792,13 @@ define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-BASE-NEXT:    fmov w0, s0
 ; CHECK-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
-; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: test_sdot_v8i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 entry:
   %0 = sext <8 x i8> %a to <8 x i32>
   %1 = sext <8 x i8> %b to <8 x i32>
@@ -2845,13 +3823,13 @@ define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-SD-BASE-NEXT:    fmov w0, s0
 ; CHECK-SD-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
-; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: test_sdot_v16i8:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
+; CHECK-DOT-NEXT:    addv s0, v2.4s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
@@ -2915,6 +3893,111 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    fmov w9, s0
 ; CHECK-SD-DOT-NEXT:    add w0, w9, w8
 ; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #8]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
+; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #9]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v6.b[1], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #2]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #10]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #2]
+; CHECK-GI-DOT-NEXT:    mov v5.b[1], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #10]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #18]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #18]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #3]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #11]
+; CHECK-GI-DOT-NEXT:    mov v5.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[3], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #4]
+; CHECK-GI-DOT-NEXT:    mov v5.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #12]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #12]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[4], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #5]
+; CHECK-GI-DOT-NEXT:    mov v5.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #13]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[5], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #6]
+; CHECK-GI-DOT-NEXT:    mov v5.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #14]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #14]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #22]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[6], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v19.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #7]
+; CHECK-GI-DOT-NEXT:    mov v5.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #15]
+; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[7], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v19.b[0]
+; CHECK-GI-DOT-NEXT:    mov v5.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v6.d[0]
+; CHECK-GI-DOT-NEXT:    mov v1.d[1], v3.d[0]
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v2.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v5.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    sdot v3.4s, v4.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <24 x i8>, ptr %p1
   %b = load <24 x i8>, ptr %p2
@@ -2978,6 +4061,210 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-SD-DOT-NEXT:    fmov w0, s0
 ; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
+; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
+; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
+; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
+; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
+; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
+; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
+; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
+; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
+; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
+; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
+; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
+; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
+; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
+; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
+; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
+; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
+; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
+; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
+; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
+; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
+; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
+; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
+; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
+; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
+; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
+; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
+; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
+; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
+; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
+; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
+; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
+; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
+; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
+; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
+; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
+; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
+; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
+; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
+; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
+; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    sdot v7.4s, v5.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    fmov w10, s2
+; CHECK-GI-DOT-NEXT:    add w8, w8, w9
+; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
   %b = load <48 x i8>, ptr %p2
@@ -3015,6 +4302,19 @@ define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-DOT-NEXT:    fmov w8, s2
 ; CHECK-SD-DOT-NEXT:    add w0, w8, w9
 ; CHECK-SD-DOT-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: test_udot_v8i8_multi_use:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    umull v2.4s, v1.4h, v0.4h
+; CHECK-GI-DOT-NEXT:    mov v3.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    fmov w8, s2
+; CHECK-GI-DOT-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
+; CHECK-GI-DOT-NEXT:    addv s0, v3.4s
+; CHECK-GI-DOT-NEXT:    fmov w9, s0
+; CHECK-GI-DOT-NEXT:    add w0, w9, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %0 = zext <8 x i8> %a to <8 x i32>
   %1 = zext <8 x i8> %b to <8 x i32>
@@ -3049,6 +4349,16 @@ define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i16:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
   %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
@@ -3113,6 +4423,29 @@ define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -3179,6 +4512,29 @@ define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i16> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -3223,6 +4579,21 @@ define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -3267,6 +4638,21 @@ define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v2.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v3.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i16> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -3310,6 +4696,20 @@ define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x0000000000ffff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -3358,6 +4758,21 @@ define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #48
+; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #48
+; CHECK-GI-DOT-NEXT:    sshr v0.2d, v0.2d, #48
+; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #48
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <2 x i16> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -3417,6 +4832,20 @@ define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -3476,6 +4905,20 @@ define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
@@ -3520,6 +4963,20 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v4.2s, v0.8b, v2.8b
+; CHECK-GI-DOT-NEXT:    udot v3.2s, v1.8b, v2.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
+; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3564,6 +5021,20 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v4.2s, v0.8b, v2.8b
+; CHECK-GI-DOT-NEXT:    sdot v3.2s, v1.8b, v2.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
+; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
@@ -3605,6 +5076,20 @@ define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -3653,6 +5138,21 @@ define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-GI-DOT-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-DOT-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-GI-DOT-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i32>
   %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
@@ -3692,6 +5192,20 @@ define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -3731,6 +5245,20 @@ define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-DOT-NEXT:    saddw2 v1.8h, v3.8h, v1.16b
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
@@ -3766,6 +5294,18 @@ define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -3801,6 +5341,18 @@ define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxth
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    addv h0, v0.8h
+; CHECK-GI-DOT-NEXT:    addv h1, v1.8h
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxth
+; CHECK-GI-DOT-NEXT:    sxth w0, w8
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i16>
   %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
@@ -3834,6 +5386,16 @@ define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    add w8, w9, w8, uxtb
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xff
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i8:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addv b0, v0.16b
+; CHECK-GI-DOT-NEXT:    addv b1, v1.16b
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    add w8, w9, w8, uxtb
+; CHECK-GI-DOT-NEXT:    and w0, w8, #0xff
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
   %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
@@ -3950,6 +5512,49 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v16.2d, v4.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v18.2d, v5.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v19.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v21.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v22.2d, v7.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v23.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -4068,6 +5673,49 @@ define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-DOT-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v7.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v16.2d, v4.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v17.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v18.2d, v5.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v19.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v20.2d, v6.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v21.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v22.2d, v7.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v23.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v4.2d, v16.2d, v4.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v17.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v5.2d, v18.2d, v5.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v19.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v6.2d, v20.2d, v6.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v21.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v7.2d, v22.2d, v7.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v23.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v2.2d, v4.2d, v2.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v5.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v3.2d, v6.2d, v3.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v7.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <16 x i8> %x to <16 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -4140,6 +5788,31 @@ define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    uaddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    uaddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -4212,6 +5885,31 @@ define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-DOT-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v5.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v6.2d, v3.2s, #0
+; CHECK-GI-DOT-NEXT:    sshll v7.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    saddw2 v2.2d, v4.2d, v2.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v0.2d, v5.2d, v0.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v3.2d, v6.2d, v3.4s
+; CHECK-GI-DOT-NEXT:    saddw2 v1.2d, v7.2d, v1.4s
+; CHECK-GI-DOT-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <8 x i8> %x to <8 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -4267,6 +5965,28 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v3.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-DOT-NEXT:    and v3.16b, v3.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v4.16b, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-GI-DOT-NEXT:    add v1.2d, v4.2d, v1.2d
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -4341,6 +6061,29 @@ define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v2.2d, v0.4s, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v2.2d, v2.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-GI-DOT-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    addp d0, v2.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v3.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <4 x i8> %x to <4 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
@@ -4384,6 +6127,20 @@ define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -4432,6 +6189,21 @@ define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_sext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-DOT-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-DOT-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    shl v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v0.2d, v0.2d, #56
+; CHECK-GI-DOT-NEXT:    sshr v1.2d, v1.2d, #56
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %xx = sext <2 x i8> %x to <2 x i64>
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -4497,6 +6269,30 @@ define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8
 ; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v5.2s, v0.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    sdot v6.2s, v3.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    udot v7.2s, v1.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    sdot v16.2s, v2.8b, v4.8b
+; CHECK-GI-DOT-NEXT:    addp v0.2s, v5.2s, v5.2s
+; CHECK-GI-DOT-NEXT:    addp v3.2s, v6.2s, v6.2s
+; CHECK-GI-DOT-NEXT:    addp v1.2s, v7.2s, v7.2s
+; CHECK-GI-DOT-NEXT:    addp v2.2s, v16.2s, v16.2s
+; CHECK-GI-DOT-NEXT:    fmov w8, s0
+; CHECK-GI-DOT-NEXT:    fmov w11, s3
+; CHECK-GI-DOT-NEXT:    fmov w9, s1
+; CHECK-GI-DOT-NEXT:    fmov w10, s2
+; CHECK-GI-DOT-NEXT:    add w8, w8, w9
+; CHECK-GI-DOT-NEXT:    add w9, w10, w11
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %axx = zext <8 x i8> %ax to <8 x i32>
   %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
@@ -4555,6 +6351,27 @@ define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i1
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    fmov w0, s0
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v6.4s, v2.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-GI-DOT-NEXT:    ushll v7.4s, v3.4h, #0
+; CHECK-GI-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-GI-DOT-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-GI-DOT-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-DOT-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
+; CHECK-GI-DOT-NEXT:    fmov w0, s0
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %axx = zext <8 x i16> %ax to <8 x i32>
   %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -4602,6 +6419,15 @@ define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-GI-BASE-NEXT:    fmov x9, d1
 ; CHECK-GI-BASE-NEXT:    add x0, x8, x9
 ; CHECK-GI-BASE-NEXT:    ret
+;
+; CHECK-GI-DOT-LABEL: add_pair_v2i64_v2i64:
+; CHECK-GI-DOT:       // %bb.0: // %entry
+; CHECK-GI-DOT-NEXT:    addp d0, v0.2d
+; CHECK-GI-DOT-NEXT:    addp d1, v1.2d
+; CHECK-GI-DOT-NEXT:    fmov x8, d0
+; CHECK-GI-DOT-NEXT:    fmov x9, d1
+; CHECK-GI-DOT-NEXT:    add x0, x8, x9
+; CHECK-GI-DOT-NEXT:    ret
 entry:
   %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
   %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
@@ -4664,61 +6490,61 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-BASE-NEXT:    fmov w0, s0
 ; CHECK-BASE-NEXT:    ret
 ;
-; CHECK-SD-DOT-LABEL: full:
-; CHECK-SD-DOT:       // %bb.0: // %entry
-; CHECK-SD-DOT-NEXT:    ldr d0, [x0]
-; CHECK-SD-DOT-NEXT:    ldr d1, [x2]
-; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-SD-DOT-NEXT:    sxtw x8, w3
-; CHECK-SD-DOT-NEXT:    sxtw x9, w1
-; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
-; CHECK-SD-DOT-NEXT:    add x11, x2, x8
-; CHECK-SD-DOT-NEXT:    add x10, x0, x9
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    add x11, x11, x8
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    add x10, x10, x9
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    add x10, x10, x9
-; CHECK-SD-DOT-NEXT:    add x11, x11, x8
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    add x10, x10, x9
-; CHECK-SD-DOT-NEXT:    add x11, x11, x8
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    add x10, x10, x9
-; CHECK-SD-DOT-NEXT:    add x11, x11, x8
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    add x10, x10, x9
-; CHECK-SD-DOT-NEXT:    add x11, x11, x8
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9]
-; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8]
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
-; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-SD-DOT-NEXT:    fmov w0, s0
-; CHECK-SD-DOT-NEXT:    ret
+; CHECK-DOT-LABEL: full:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ldr d0, [x0]
+; CHECK-DOT-NEXT:    ldr d1, [x2]
+; CHECK-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
+; CHECK-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-DOT-NEXT:    sxtw x8, w3
+; CHECK-DOT-NEXT:    sxtw x9, w1
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v3.8b, #1
+; CHECK-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
+; CHECK-DOT-NEXT:    add x11, x2, x8
+; CHECK-DOT-NEXT:    add x10, x0, x9
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    add x10, x10, x9
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    add x10, x10, x9
+; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    add x10, x10, x9
+; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    add x10, x10, x9
+; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    add x10, x10, x9
+; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    ldr d4, [x11]
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10, x9]
+; CHECK-DOT-NEXT:    ldr d4, [x11, x8]
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    fmov w0, s0
+; CHECK-DOT-NEXT:    ret
 entry:
   %idx.ext8 = sext i32 %s2 to i64
   %idx.ext = sext i32 %s1 to i64



More information about the llvm-commits mailing list