[llvm-branch-commits] [llvm] [AArch64][GlobalISel] Avoid splitting loads of large vector types into individual element loads (PR #85042)

Dhruv Chawla via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Mar 13 04:10:16 PDT 2024


https://github.com/dc03-work updated https://github.com/llvm/llvm-project/pull/85042

>From ec953a06c9a3c9a29155bc07dfc3a1bdb033ee23 Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhruvc at nvidia.com>
Date: Wed, 13 Mar 2024 10:36:35 +0530
Subject: [PATCH] [AArch64][GlobalISel] Avoid splitting loads of large vector
 types into individual element loads

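This patch reorders the legalization rules for G_LOAD so that loads of
fixed-width vector types larger than 128 bits are split into loads of
vectors of at most 128 bits with the same element type. It does so by
applying the clampMaxNumElements rules before
lowerIfMemSizeNotByteSizePow2 and the scalar clamping rules, rather than
after them.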

This is an improvement over the previous behavior, where such loads were
split into an individual load for each element of the vector.
---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   10 +-
 .../GlobalISel/legalize-load-store.mir        |   41 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 1476 +++--------------
 3 files changed, 225 insertions(+), 1302 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index fea9d4495f44c7..2ae2923dfb353e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -373,6 +373,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
+      .clampMaxNumElements(0, s8, 16)
+      .clampMaxNumElements(0, s16, 8)
+      .clampMaxNumElements(0, s32, 4)
+      .clampMaxNumElements(0, s64, 2)
+      .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
       .narrowScalarIf(
@@ -383,11 +388,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Query.Types[0].getSizeInBits() > 32;
           },
           changeTo(0, s32))
-      .clampMaxNumElements(0, s8, 16)
-      .clampMaxNumElements(0, s16, 8)
-      .clampMaxNumElements(0, s32, 4)
-      .clampMaxNumElements(0, s64, 2)
-      .clampMaxNumElements(0, p0, 2)
       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
       .bitcastIf(typeInSet(0, {v4s8}),
                  [=](const LegalityQuery &Query) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 5cbb8649d158b0..aa152aea81ff9c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -711,33 +711,24 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 94ab173e9183ac..66ef436f48c637 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2063,228 +2063,52 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v3.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umull v3.4s, v4.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v4.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    umull v4.4s, v5.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v5.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
 ; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s3
-; CHECK-GI-BASE-NEXT:    fmov w12, s4
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2352,449 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
-; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
-; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
-; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
-; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
-; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
-; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
-; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
-; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
-; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
-; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
-; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
-; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
-; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
-; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
-; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
-; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
-; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
-; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
-; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
-; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
-; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
-; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v6.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.8h, v17.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v16.8h, v16.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v17.8h, v19.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v18.8h, v20.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v6.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v7.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v7.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    umull v7.4s, v16.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v16.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    umull v16.4s, v17.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umull2 v3.4s, v17.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v19.8h, v21.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v17.4s, v18.4h, v4.4h
-; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v18.8h, v4.8h
-; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    umull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    umull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    umull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
 ; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    umull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
 ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    umull v18.4s, v19.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v19.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
-; CHECK-GI-BASE-NEXT:    fmov w10, s6
-; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w12, s7
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
-; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
-; CHECK-GI-BASE-NEXT:    fmov w14, s16
-; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
-; CHECK-GI-BASE-NEXT:    fmov w15, s3
-; CHECK-GI-BASE-NEXT:    fmov w16, s17
-; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
-; CHECK-GI-BASE-NEXT:    add w11, w14, w15
-; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w10, w11, w16
-; CHECK-GI-BASE-NEXT:    fmov w11, s0
-; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    udot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    udot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
@@ -2947,228 +2413,52 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v3.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smull v3.4s, v4.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v4.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    smull v4.4s, v5.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v5.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
 ; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s3
-; CHECK-GI-BASE-NEXT:    fmov w12, s4
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -3236,449 +2526,91 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
-; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
-; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
-; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
-; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
-; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
-; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
-; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
-; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
-; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
-; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
-; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
-; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
-; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
-; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
-; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
-; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
-; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
-; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
-; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
-; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
-; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
-; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.8h, v17.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v16.8h, v16.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v17.8h, v19.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v18.8h, v20.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v6.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v7.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v7.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    smull v7.4s, v16.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v16.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    smull v16.4s, v17.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    smull2 v3.4s, v17.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v19.8h, v21.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v17.4s, v18.4h, v4.4h
-; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v18.8h, v4.8h
-; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    smull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    smull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    smull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
 ; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    smull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
 ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    smull v18.4s, v19.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v19.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
-; CHECK-GI-BASE-NEXT:    fmov w10, s6
-; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w12, s7
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
-; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
-; CHECK-GI-BASE-NEXT:    fmov w14, s16
-; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
-; CHECK-GI-BASE-NEXT:    fmov w15, s3
-; CHECK-GI-BASE-NEXT:    fmov w16, s17
-; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
-; CHECK-GI-BASE-NEXT:    add w11, w14, w15
-; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w10, w11, w16
-; CHECK-GI-BASE-NEXT:    fmov w11, s0
-; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    sdot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    sdot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1



More information about the llvm-branch-commits mailing list