[llvm] 208a985 - [AArch64][GlobalISel] Avoid splitting loads of large vector types into individual element loads (#85042)

Dhruv Chawla via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 17 20:02:31 PDT 2024


Author: Dhruv Chawla
Date: 2024-03-18T08:32:17+05:30
New Revision: 208a9850e6a4b64ad6311361735d27a9c6cbd8ec

URL: https://github.com/llvm/llvm-project/commit/208a9850e6a4b64ad6311361735d27a9c6cbd8ec
DIFF: https://github.com/llvm/llvm-project/commit/208a9850e6a4b64ad6311361735d27a9c6cbd8ec.diff

LOG: [AArch64][GlobalISel] Avoid splitting loads of large vector types into individual element loads (#85042)

This patch fixes the legalization of G_LOAD for vectors whose memory size
is not a power-of-2 number of bytes. Because .lowerIfMemSizeNotByteSizePow2
was applied before .clampMaxNumElements, such vectors matched the lowering
rule before they could be clamped.

Such vectors would be lowered into per-element loads instead of being
split up into 128-bit chunks.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
    llvm/test/CodeGen/AArch64/vecreduce-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index fea9d4495f44c7..2ae2923dfb353e 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -373,6 +373,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
+      .clampMaxNumElements(0, s8, 16)
+      .clampMaxNumElements(0, s16, 8)
+      .clampMaxNumElements(0, s32, 4)
+      .clampMaxNumElements(0, s64, 2)
+      .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
       .narrowScalarIf(
@@ -383,11 +388,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Query.Types[0].getSizeInBits() > 32;
           },
           changeTo(0, s32))
-      .clampMaxNumElements(0, s8, 16)
-      .clampMaxNumElements(0, s16, 8)
-      .clampMaxNumElements(0, s32, 4)
-      .clampMaxNumElements(0, s64, 2)
-      .clampMaxNumElements(0, p0, 2)
       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
       .bitcastIf(typeInSet(0, {v4s8}),
                  [=](const LegalityQuery &Query) {

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 5cbb8649d158b0..aa152aea81ff9c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -711,33 +711,24 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 94ab173e9183ac..66ef436f48c637 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2063,228 +2063,52 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v3.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umull v3.4s, v4.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v4.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    umull v4.4s, v5.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v5.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
 ; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s3
-; CHECK-GI-BASE-NEXT:    fmov w12, s4
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2352,449 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
-; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
-; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
-; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
-; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
-; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
-; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
-; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
-; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
-; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
-; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
-; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
-; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
-; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
-; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
-; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
-; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
-; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
-; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
-; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
-; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
-; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
-; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v6.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v7.8h, v17.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v16.8h, v16.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v17.8h, v19.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v18.8h, v20.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v6.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v7.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v7.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    umull v7.4s, v16.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v16.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    umull v16.4s, v17.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umull2 v3.4s, v17.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v19.8h, v21.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v17.4s, v18.4h, v4.4h
-; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v18.8h, v4.8h
-; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    umull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    umull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    umull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
 ; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    umull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
 ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    umull v18.4s, v19.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v19.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
-; CHECK-GI-BASE-NEXT:    fmov w10, s6
-; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w12, s7
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
-; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
-; CHECK-GI-BASE-NEXT:    fmov w14, s16
-; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
-; CHECK-GI-BASE-NEXT:    fmov w15, s3
-; CHECK-GI-BASE-NEXT:    fmov w16, s17
-; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
-; CHECK-GI-BASE-NEXT:    add w11, w14, w15
-; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w10, w11, w16
-; CHECK-GI-BASE-NEXT:    fmov w11, s0
-; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    udot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    udot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
@@ -2947,228 +2413,52 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v3.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smull v3.4s, v4.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v4.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    smull v4.4s, v5.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v5.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
 ; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s3
-; CHECK-GI-BASE-NEXT:    fmov w12, s4
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -3236,449 +2526,91 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #10]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #33]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #18]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x0, #24]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #25]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #34]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v5.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x0, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #41]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #11]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #42]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #12]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #26]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #13]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v21.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #27]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #43]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x0, #28]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #36]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #37]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #15]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x0, #29]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v20.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #45]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x0, #38]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x0, #23]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #2]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v26.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b26, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #8]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #24]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #25]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v20.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b20, [x1, #32]
-; CHECK-GI-BASE-NEXT:    mov v6.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #33]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v21.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[1], v26.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b21, [x1, #40]
-; CHECK-GI-BASE-NEXT:    mov v16.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #41]
-; CHECK-GI-BASE-NEXT:    mov v19.b[1], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[1], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #10]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v21.b[1], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #26]
-; CHECK-GI-BASE-NEXT:    mov v6.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #34]
-; CHECK-GI-BASE-NEXT:    mov v17.b[2], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #42]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #3]
-; CHECK-GI-BASE-NEXT:    mov v19.b[2], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[2], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v21.b[2], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #27]
-; CHECK-GI-BASE-NEXT:    mov v6.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #35]
-; CHECK-GI-BASE-NEXT:    mov v17.b[3], v25.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #43]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #4]
-; CHECK-GI-BASE-NEXT:    mov v19.b[3], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[3], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v21.b[3], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #28]
-; CHECK-GI-BASE-NEXT:    mov v6.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #36]
-; CHECK-GI-BASE-NEXT:    mov v17.b[4], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #44]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #5]
-; CHECK-GI-BASE-NEXT:    mov v19.b[4], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[4], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v21.b[4], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #29]
-; CHECK-GI-BASE-NEXT:    mov v6.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #37]
-; CHECK-GI-BASE-NEXT:    ldr b27, [x0, #30]
-; CHECK-GI-BASE-NEXT:    mov v17.b[5], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #45]
-; CHECK-GI-BASE-NEXT:    mov v19.b[5], v23.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #6]
-; CHECK-GI-BASE-NEXT:    mov v20.b[5], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v27.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v21.b[5], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #30]
-; CHECK-GI-BASE-NEXT:    mov v6.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #38]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #31]
-; CHECK-GI-BASE-NEXT:    mov v17.b[6], v23.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #46]
-; CHECK-GI-BASE-NEXT:    mov v19.b[6], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #7]
-; CHECK-GI-BASE-NEXT:    mov v20.b[6], v22.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b28, [x0, #46]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #15]
-; CHECK-GI-BASE-NEXT:    mov v21.b[6], v24.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b24, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #31]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-BASE-NEXT:    mov v6.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b25, [x1, #39]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v28.b[0]
-; CHECK-GI-BASE-NEXT:    mov v17.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    mov v16.b[7], v24.b[0]
-; CHECK-GI-BASE-NEXT:    mov v19.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v20.b[7], v25.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b23, [x0, #47]
-; CHECK-GI-BASE-NEXT:    ldr b22, [x1, #47]
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v23.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    mov v21.b[7], v22.b[0]
-; CHECK-GI-BASE-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v7.8h, v17.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v16.8h, v16.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v17.8h, v19.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v18.8h, v20.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v6.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v7.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v7.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    smull v7.4s, v16.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v16.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    smull v16.4s, v17.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    smull2 v3.4s, v17.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v19.8h, v21.8b, #0
-; CHECK-GI-BASE-NEXT:    smull v17.4s, v18.4h, v4.4h
-; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v18.8h, v4.8h
-; CHECK-GI-BASE-NEXT:    addv s20, v20.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    smull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    smull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    smull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    addv s6, v6.4s
-; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
 ; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    smull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
 ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    smull v18.4s, v19.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v19.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    addv s16, v16.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s17, v17.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s20
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
-; CHECK-GI-BASE-NEXT:    fmov w10, s6
-; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w12, s7
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
-; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
-; CHECK-GI-BASE-NEXT:    fmov w14, s16
-; CHECK-GI-BASE-NEXT:    addv s0, v5.4s
-; CHECK-GI-BASE-NEXT:    fmov w15, s3
-; CHECK-GI-BASE-NEXT:    fmov w16, s17
-; CHECK-GI-BASE-NEXT:    fmov w17, s4
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
-; CHECK-GI-BASE-NEXT:    add w11, w14, w15
-; CHECK-GI-BASE-NEXT:    fmov w18, s18
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w10, w11, w16
-; CHECK-GI-BASE-NEXT:    fmov w11, s0
-; CHECK-GI-BASE-NEXT:    add w9, w10, w17
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w9, w18, w11
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    sdot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    sdot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1


        


More information about the llvm-commits mailing list