[llvm] [GlobalISel] Add support to moreElementsVector for G_SEXT, G_ZEXT and G_ANYEXT (PR #85038)

Dhruv Chawla via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 13 00:48:40 PDT 2024


https://github.com/dc03-work created https://github.com/llvm/llvm-project/pull/85038

(No description was provided for this pull request.)

>From e6c1c9fdbbb0a2e328e660d615de9ed2a765925f Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhruvc at nvidia.com>
Date: Wed, 13 Mar 2024 13:18:29 +0530
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  20 +
 llvm/test/CodeGen/AArch64/fcmp.ll             |  81 +-
 llvm/test/CodeGen/AArch64/sext.ll             |  14 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 809 +++++++++++++++---
 llvm/test/CodeGen/AArch64/zext.ll             |  18 +-
 5 files changed, 779 insertions(+), 163 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bd3ff7265d51f9..a480c290907761 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5496,6 +5496,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return Legalized;
   }
 
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_ANYEXT: {
+    LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    if (TypeIdx == 0) {
+      DstTy = MoreTy;
+      SrcTy = MoreTy.changeElementType(SrcTy.getElementType());
+    } else if (TypeIdx == 1) {
+      SrcTy = MoreTy;
+      DstTy = MoreTy.changeElementType(DstTy.getElementType());
+    }
+
+    Observer.changingInstr(MI);
+    moreElementsVectorSrc(MI, SrcTy, 1);
+    moreElementsVectorDst(MI, DstTy, 0);
+    Observer.changedInstr(MI);
+    return Legalized;
+  }
+
   default:
     return UnableToLegalize;
   }
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 2d0b5574cdd7ba..9916aeeab1cad1 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s3, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
 ; CHECK-GI-FP16-NEXT:    fmov s6, w0
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #8]
 ; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
-; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
-; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
-; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    fmov s3, w9
-; CHECK-GI-FP16-NEXT:    fmov s0, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmov s2, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s1, w10
-; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s3
+; CHECK-GI-FP16-NEXT:    fmov s3, w7
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    fmov s2, w4
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v17.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    fmov s4, w7
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    fmov s3, w4
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    eor v4.16b, v1.16b, v5.16b
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w6
+; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v7.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 61f04fbf0484f7..3e0d5dd875097f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sxtb x8, w2
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
@@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 66b49466cc7361..94ab173e9183ac 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4,11 +4,6 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
 
-; CHECK-GI-BASE:        warning: Instruction selection used fallback path for test_udot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_udot_v48i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v48i8
-
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -2068,25 +2063,125 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
+; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v3.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v3.4s, v4.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v4.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v4.4s, v5.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v5.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s4
+; CHECK-GI-BASE-NEXT:    fmov w13, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w8, w8, w10
+; CHECK-GI-BASE-NEXT:    add w9, w11, w12
+; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
@@ -2257,39 +2352,245 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
+; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
+; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
+; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
+; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
+; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
+; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
+; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
+; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
+; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
+; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
+; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
+; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
+; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
+; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
+; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
+; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
+; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
+; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
+; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
+; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
+; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
+; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
+; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
+; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
+; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
+; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
+; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
+; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
+; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
+; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
 ; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
 ; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v17.8h, v19.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v18.8h, v20.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v6.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v7.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v7.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v16.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v16.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    umull v16.4s, v17.4h, v3.4h
+; CHECK-GI-BASE-NEXT:    umull2 v3.4s, v17.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v19.8h, v21.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v17.4s, v18.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v18.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v19.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v19.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    fmov w10, s6
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    fmov w12, s7
+; CHECK-GI-BASE-NEXT:    fmov w13, s2
+; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
+; CHECK-GI-BASE-NEXT:    fmov w14, s16
+; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
+; CHECK-GI-BASE-NEXT:    fmov w15, s3
+; CHECK-GI-BASE-NEXT:    fmov w16, s17
+; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w11, w12
+; CHECK-GI-BASE-NEXT:    add w8, w8, w10
+; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w11, w14, w15
+; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w16
+; CHECK-GI-BASE-NEXT:    fmov w11, s0
+; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
@@ -2646,25 +2947,125 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
+; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v3.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v3.4s, v4.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v4.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v4.4s, v5.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v5.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s4
+; CHECK-GI-BASE-NEXT:    fmov w13, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w8, w8, w10
+; CHECK-GI-BASE-NEXT:    add w9, w11, w12
+; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
@@ -2835,39 +3236,245 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
+; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
+; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
+; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
+; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
+; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
+; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
+; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
+; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
+; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
+; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
+; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
+; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
+; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
+; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
+; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
+; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
+; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
+; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
+; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
+; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
+; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
+; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
+; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
+; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
+; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
+; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
+; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
+; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
+; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
+; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
+; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
+; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
+; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
+; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
+; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
+; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
+; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
+; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
+; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
+; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
+; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
+; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
+; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
+; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
+; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
+; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
+; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
+; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
+; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
+; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
+; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
+; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
+; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
+; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
+; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
+; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
+; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
 ; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
 ; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
+; CHECK-GI-BASE-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v17.8h, v19.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v18.8h, v20.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v6.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v7.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v7.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v7.4s, v16.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v16.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    smull v16.4s, v17.4h, v3.4h
+; CHECK-GI-BASE-NEXT:    smull2 v3.4s, v17.8h, v3.8h
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v19.8h, v21.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v17.4s, v18.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v18.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v19.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v19.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
+; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    fmov w10, s6
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    fmov w12, s7
+; CHECK-GI-BASE-NEXT:    fmov w13, s2
+; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
+; CHECK-GI-BASE-NEXT:    fmov w14, s16
+; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
+; CHECK-GI-BASE-NEXT:    fmov w15, s3
+; CHECK-GI-BASE-NEXT:    fmov w16, s17
+; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w11, w12
+; CHECK-GI-BASE-NEXT:    add w8, w8, w10
+; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w11, w14, w15
+; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w16
+; CHECK-GI-BASE-NEXT:    fmov w11, s0
+; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 54b29be2132cdd..716d2398996be2 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -305,15 +305,14 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d1, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT:    movi v0.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    and x8, x2, #0xff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v1.d[1], x1
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -470,15 +469,14 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI27_0]
 ; CHECK-GI-NEXT:    and x8, x2, #0x3ff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0



More information about the llvm-commits mailing list