[llvm] [GlobalISel] Widen vector loads from aligned ptrs (PR #144309)

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 16 00:39:32 PDT 2025


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/144309

If the pointer is aligned to more than the size of the vector, we can widen the load up to the next power-of-2 size, as SelectionDAG (SDAG) already does.

Some of the v3 tests currently produce worse code with this change - those regressions should be addressed in separate issues.

>From 94b23cb70a5305c4794452d50b342d421bbfca62 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 16 Jun 2025 08:36:36 +0100
Subject: [PATCH] [GlobalISel] Widen vector loads from aligned ptrs

If the pointer is aligned to more than the size of the vector, we can widen the
load up to next power of 2 size, as SDAG performs.

Some of the v3 tests are currently worse - those should be addressed in other
issues.
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  13 +
 llvm/test/CodeGen/AArch64/add.ll              |  30 +-
 llvm/test/CodeGen/AArch64/andorxor.ll         |  90 +--
 llvm/test/CodeGen/AArch64/ctlz.ll             |  18 +-
 llvm/test/CodeGen/AArch64/ctpop.ll            |  18 +-
 llvm/test/CodeGen/AArch64/cttz.ll             |  38 +-
 llvm/test/CodeGen/AArch64/load.ll             |  98 +--
 llvm/test/CodeGen/AArch64/mul.ll              |  30 +-
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 739 +++++++++---------
 llvm/test/CodeGen/AArch64/sub.ll              |  30 +-
 10 files changed, 550 insertions(+), 554 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 028bffd1bf5a7..65cfa722dbd72 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4072,6 +4072,19 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
     if (MemTy != DstTy)
       return UnableToLegalize;
 
+    Align Alignment = LoadMI.getAlign();
+    if (Alignment.value() * 8 > MemSizeInBits &&
+        isPowerOf2_64(DstTy.getScalarSizeInBits())) {
+      LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
+                                     DstTy.getElementType());
+      MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
+      auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
+      MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
+                                                   NewLoad.getReg(0));
+      LoadMI.eraseFromParent();
+      return Legalized;
+    }
+
     // TODO: We can do better than scalarizing the vector and at least split it
     // in half.
     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index d5bd1b712a2a6..96168cb80196f 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index f7df1092287bd..a7875dbebd0e6 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: and_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
@@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: or_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
@@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: xor_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
@@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: and_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
@@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: or_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
@@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: xor_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index b1c6e24c30a7d..04124609eec74 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT:    clz v0.8b, v0.8b
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT:    clz v0.8b, v2.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
 ; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    clz v0.4h, v0.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 55f75b6bc3f27..c739be95cd243 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT:    cnt v0.8b, v2.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
 ; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 93ac97e20dabd..fc9bf2c0aca65 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w9, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x0]
 ; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    mov v1.h[1], w10
-; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.h[2], w8
 ; CHECK-GI-NEXT:    add x8, x0, #1
-; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov b1, v0.b[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    add x9, x0, #2
-; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT:    eor v1.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
@@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) {
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT:    ldr h1, [x0]
-; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    add x9, x0, #4
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    add x10, x0, #4
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    add x8, x0, #2
 ; CHECK-GI-NEXT:    eor v2.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-GI-NEXT:    and v0.8b, v2.8b, v0.8b
 ; CHECK-GI-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GI-NEXT:    uaddlp v0.4h, v0.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
-; CHECK-GI-NEXT:    st1 { v0.h }[1], [x9]
-; CHECK-GI-NEXT:    st1 { v0.h }[2], [x10]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = load <3 x i16>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 6b26ae98a4ed8..c4bb6e37d6eaf 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -335,102 +335,50 @@ define <3 x i8> @load_v3i8(ptr %ptr) {
 ;
 ; CHECK-GI-LABEL: load_v3i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w1, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w2, [x0, #2]
-; CHECK-GI-NEXT:    mov w0, w8
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
 ; CHECK-GI-NEXT:    ret
   %a = load <3 x i8>, ptr %ptr
   ret <3 x i8> %a
 }
 
 define <7 x i8> @load_v7i8(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v7i8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #1
-; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
-; CHECK-GI-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.b }[2], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #3
-; CHECK-GI-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #5
-; CHECK-GI-NEXT:    ld1 { v0.b }[5], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v0.b }[6], [x8]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v7i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
   %a = load <7 x i8>, ptr %ptr
   ret <7 x i8> %a
 }
 
 define <3 x i16> @load_v3i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v3i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v3i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
   %a = load <3 x i16>, ptr %ptr
   ret <3 x i16> %a
 }
 
 define <7 x i16> @load_v7i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v7i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #6
-; CHECK-GI-NEXT:    ld1 { v0.h }[3], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.h }[4], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #10
-; CHECK-GI-NEXT:    ld1 { v0.h }[5], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #12
-; CHECK-GI-NEXT:    ld1 { v0.h }[6], [x8]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v7i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
   %a = load <7 x i16>, ptr %ptr
   ret <7 x i16> %a
 }
 
 define <3 x i32> @load_v3i32(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q0, [x0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: load_v3i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr s0, [x0]
-; CHECK-GI-NEXT:    add x8, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.s }[1], [x8]
-; CHECK-GI-NEXT:    add x8, x0, #8
-; CHECK-GI-NEXT:    ld1 { v0.s }[2], [x8]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: load_v3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
   %a = load <3 x i32>, ptr %ptr
   ret <3 x i32> %a
 }
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 1558043f7f40a..9c69a6f03b858 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -122,16 +122,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
@@ -282,16 +286,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index a534112b7c559..4f0c4080aa0ce 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -412,31 +412,33 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ;
 ; CHECK-GI-LABEL: test_udot_v5i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #4]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
-; CHECK-GI-NEXT:    ldrb w11, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #1]
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    ldrb w9, [x0]
-; CHECK-GI-NEXT:    fmov s0, w10
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w12
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w9
-; CHECK-GI-NEXT:    ldrb w9, [x1, #3]
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v0.s[3], w9
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    umov w8, v1.b[4]
+; CHECK-GI-NEXT:    umov w9, v0.b[4]
+; CHECK-GI-NEXT:    umov w10, v1.b[0]
+; CHECK-GI-NEXT:    umov w12, v0.b[0]
+; CHECK-GI-NEXT:    umov w11, v1.b[1]
+; CHECK-GI-NEXT:    umov w13, v0.b[1]
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    umov w9, v1.b[2]
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    umov w10, v1.b[3]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.s[1], w13
+; CHECK-GI-NEXT:    umov w11, v0.b[3]
+; CHECK-GI-NEXT:    mov v4.s[1], wzr
+; CHECK-GI-NEXT:    mov v2.s[2], w9
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov v4.s[2], wzr
+; CHECK-GI-NEXT:    mov v2.s[3], w10
+; CHECK-GI-NEXT:    mov v3.s[3], w11
+; CHECK-GI-NEXT:    mov v4.s[3], wzr
+; CHECK-GI-NEXT:    mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
 ; CHECK-GI-NEXT:    ret
@@ -466,20 +468,21 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
 ;
 ; CHECK-GI-LABEL: test_udot_v5i8_nomla:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.s[1], w10
-; CHECK-GI-NEXT:    mov v1.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[2], wzr
-; CHECK-GI-NEXT:    ldrb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], wzr
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    umov w8, v0.b[0]
+; CHECK-GI-NEXT:    umov w9, v0.b[4]
+; CHECK-GI-NEXT:    umov w10, v0.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    umov w8, v0.b[2]
+; CHECK-GI-NEXT:    umov w9, v0.b[3]
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v2.s[1], wzr
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    mov v2.s[2], wzr
+; CHECK-GI-NEXT:    mov v1.s[3], w9
+; CHECK-GI-NEXT:    mov v2.s[3], wzr
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v2.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
@@ -506,31 +509,33 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
 ;
 ; CHECK-GI-LABEL: test_sdot_v5i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #4]
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #4]
-; CHECK-GI-NEXT:    ldrsb w10, [x1]
-; CHECK-GI-NEXT:    ldrsb w11, [x0, #1]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #1]
-; CHECK-GI-NEXT:    mul w8, w9, w8
-; CHECK-GI-NEXT:    ldrsb w9, [x0]
-; CHECK-GI-NEXT:    fmov s0, w10
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #2]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w12
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #2]
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v0.s[2], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #3]
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ldrsb w8, [x0, #3]
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v0.s[3], w9
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    smov w8, v1.b[4]
+; CHECK-GI-NEXT:    smov w9, v0.b[4]
+; CHECK-GI-NEXT:    smov w10, v1.b[0]
+; CHECK-GI-NEXT:    smov w12, v0.b[0]
+; CHECK-GI-NEXT:    smov w11, v1.b[1]
+; CHECK-GI-NEXT:    smov w13, v0.b[1]
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    smov w9, v1.b[2]
+; CHECK-GI-NEXT:    fmov s3, w12
+; CHECK-GI-NEXT:    smov w10, v1.b[3]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    smov w8, v0.b[2]
+; CHECK-GI-NEXT:    mov v3.s[1], w13
+; CHECK-GI-NEXT:    smov w11, v0.b[3]
+; CHECK-GI-NEXT:    mov v4.s[1], wzr
+; CHECK-GI-NEXT:    mov v2.s[2], w9
+; CHECK-GI-NEXT:    mov v3.s[2], w8
+; CHECK-GI-NEXT:    mov v4.s[2], wzr
+; CHECK-GI-NEXT:    mov v2.s[3], w10
+; CHECK-GI-NEXT:    mov v3.s[3], w11
+; CHECK-GI-NEXT:    mov v4.s[3], wzr
+; CHECK-GI-NEXT:    mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
 ; CHECK-GI-NEXT:    ret
@@ -2298,128 +2303,145 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ;
 ; CHECK-GI-LABEL: test_udot_v25i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #16]!
-; CHECK-GI-NEXT:    ldrb w11, [x1, #4]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #5]
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    ldp q1, q7, [x1]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    umov w13, v2.b[4]
-; CHECK-GI-NEXT:    umov w14, v2.b[5]
-; CHECK-GI-NEXT:    umov w10, v2.b[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    umov w9, v2.b[8]
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    umov w11, v2.b[12]
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrb w8, [x1, #1]
+; CHECK-GI-NEXT:    ldp q16, q3, [x0]
+; CHECK-GI-NEXT:    umov w9, v1.b[4]
+; CHECK-GI-NEXT:    umov w11, v1.b[5]
+; CHECK-GI-NEXT:    umov w18, v1.b[0]
+; CHECK-GI-NEXT:    umov w0, v1.b[12]
+; CHECK-GI-NEXT:    umov w3, v7.b[4]
+; CHECK-GI-NEXT:    umov w12, v1.b[1]
+; CHECK-GI-NEXT:    umov w13, v1.b[6]
+; CHECK-GI-NEXT:    umov w1, v1.b[13]
+; CHECK-GI-NEXT:    umov w4, v7.b[5]
+; CHECK-GI-NEXT:    umov w15, v1.b[2]
+; CHECK-GI-NEXT:    umov w8, v1.b[3]
+; CHECK-GI-NEXT:    umov w16, v1.b[7]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    umov w14, v1.b[8]
+; CHECK-GI-NEXT:    umov w17, v1.b[9]
+; CHECK-GI-NEXT:    umov w10, v1.b[10]
+; CHECK-GI-NEXT:    umov w9, v1.b[11]
+; CHECK-GI-NEXT:    umov w5, v1.b[14]
+; CHECK-GI-NEXT:    umov w6, v7.b[0]
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w3
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    umov w11, v1.b[15]
+; CHECK-GI-NEXT:    fmov s1, w18
+; CHECK-GI-NEXT:    umov w7, v7.b[1]
+; CHECK-GI-NEXT:    umov w18, v7.b[6]
+; CHECK-GI-NEXT:    umov w21, v16.b[4]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w4
+; CHECK-GI-NEXT:    fmov s6, w14
+; CHECK-GI-NEXT:    mov v1.s[1], w12
+; CHECK-GI-NEXT:    umov w12, v7.b[3]
+; CHECK-GI-NEXT:    umov w14, v7.b[7]
+; CHECK-GI-NEXT:    mov v2.s[2], w13
+; CHECK-GI-NEXT:    umov w13, v7.b[2]
+; CHECK-GI-NEXT:    umov w0, v7.b[8]
+; CHECK-GI-NEXT:    fmov s7, w6
+; CHECK-GI-NEXT:    umov w23, v16.b[12]
+; CHECK-GI-NEXT:    umov w25, v3.b[4]
+; CHECK-GI-NEXT:    mov v6.s[1], w17
+; CHECK-GI-NEXT:    mov v4.s[2], w5
+; CHECK-GI-NEXT:    mov v5.s[2], w18
+; CHECK-GI-NEXT:    mov v1.s[2], w15
+; CHECK-GI-NEXT:    umov w6, v16.b[0]
+; CHECK-GI-NEXT:    umov w3, v16.b[1]
+; CHECK-GI-NEXT:    mov v2.s[3], w16
+; CHECK-GI-NEXT:    mov v7.s[1], w7
+; CHECK-GI-NEXT:    umov w16, v16.b[2]
+; CHECK-GI-NEXT:    umov w15, v16.b[3]
+; CHECK-GI-NEXT:    umov w22, v16.b[5]
+; CHECK-GI-NEXT:    umov w5, v16.b[6]
+; CHECK-GI-NEXT:    umov w18, v16.b[7]
+; CHECK-GI-NEXT:    umov w19, v16.b[8]
+; CHECK-GI-NEXT:    umov w7, v16.b[9]
+; CHECK-GI-NEXT:    umov w24, v16.b[13]
+; CHECK-GI-NEXT:    umov w1, v16.b[10]
+; CHECK-GI-NEXT:    umov w17, v16.b[11]
+; CHECK-GI-NEXT:    umov w20, v16.b[14]
+; CHECK-GI-NEXT:    umov w4, v16.b[15]
+; CHECK-GI-NEXT:    fmov s16, w21
+; CHECK-GI-NEXT:    umov w21, v3.b[8]
+; CHECK-GI-NEXT:    umov w26, v3.b[5]
+; CHECK-GI-NEXT:    fmov s17, w23
+; CHECK-GI-NEXT:    umov w23, v3.b[0]
+; CHECK-GI-NEXT:    fmov s18, w25
+; CHECK-GI-NEXT:    umov w25, v3.b[3]
+; CHECK-GI-NEXT:    mov v16.s[1], w22
+; CHECK-GI-NEXT:    umov w22, v3.b[1]
+; CHECK-GI-NEXT:    fmov s19, w6
+; CHECK-GI-NEXT:    mov v17.s[1], w24
+; CHECK-GI-NEXT:    umov w24, v3.b[2]
+; CHECK-GI-NEXT:    umov w6, v3.b[7]
+; CHECK-GI-NEXT:    mul w0, w0, w21
+; CHECK-GI-NEXT:    mov v18.s[1], w26
+; CHECK-GI-NEXT:    umov w26, v3.b[6]
+; CHECK-GI-NEXT:    fmov s3, w19
+; CHECK-GI-NEXT:    fmov s20, w23
+; CHECK-GI-NEXT:    mov v19.s[1], w3
+; CHECK-GI-NEXT:    mov v16.s[2], w5
 ; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    fmov s7, w13
-; CHECK-GI-NEXT:    fmov s4, w10
-; CHECK-GI-NEXT:    umov w10, v2.b[13]
-; CHECK-GI-NEXT:    mov v5.s[1], w12
-; CHECK-GI-NEXT:    umov w13, v2.b[9]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    fmov s16, w11
-; CHECK-GI-NEXT:    umov w9, v1.b[0]
-; CHECK-GI-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NEXT:    mov v7.s[1], w14
-; CHECK-GI-NEXT:    umov w14, v2.b[6]
-; CHECK-GI-NEXT:    ldrb w12, [x1, #6]
-; CHECK-GI-NEXT:    umov w8, v2.b[1]
-; CHECK-GI-NEXT:    umov w11, v2.b[2]
-; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v16.s[1], w10
-; CHECK-GI-NEXT:    umov w10, v2.b[14]
-; CHECK-GI-NEXT:    mov v5.s[2], w12
-; CHECK-GI-NEXT:    umov w12, v1.b[5]
-; CHECK-GI-NEXT:    mov v6.s[1], w13
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    mov v7.s[2], w14
-; CHECK-GI-NEXT:    umov w14, v1.b[4]
-; CHECK-GI-NEXT:    umov w9, v2.b[10]
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[1]
-; CHECK-GI-NEXT:    umov w13, v2.b[7]
-; CHECK-GI-NEXT:    mov v16.s[2], w10
-; CHECK-GI-NEXT:    umov w10, v2.b[15]
-; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    fmov s18, w14
-; CHECK-GI-NEXT:    mov v6.s[2], w9
-; CHECK-GI-NEXT:    umov w9, v1.b[12]
-; CHECK-GI-NEXT:    mov v4.s[2], w11
-; CHECK-GI-NEXT:    ldrb w11, [x1, #7]
-; CHECK-GI-NEXT:    mov v17.s[1], w8
-; CHECK-GI-NEXT:    ldrb w8, [x1, #2]
-; CHECK-GI-NEXT:    mov v16.s[3], w10
-; CHECK-GI-NEXT:    umov w10, v1.b[13]
-; CHECK-GI-NEXT:    mov v18.s[1], w12
-; CHECK-GI-NEXT:    umov w12, v1.b[6]
-; CHECK-GI-NEXT:    mov v5.s[3], w11
-; CHECK-GI-NEXT:    ldrb w11, [x0, #16]!
-; CHECK-GI-NEXT:    mov v7.s[3], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[2]
-; CHECK-GI-NEXT:    fmov s20, w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #5]
-; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[8]
-; CHECK-GI-NEXT:    fmov s22, w11
-; CHECK-GI-NEXT:    mov v18.s[2], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #4]
-; CHECK-GI-NEXT:    umov w11, v2.b[3]
-; CHECK-GI-NEXT:    mov v20.s[1], w10
-; CHECK-GI-NEXT:    ldrb w10, [x0, #8]
-; CHECK-GI-NEXT:    fmov s21, w12
-; CHECK-GI-NEXT:    ldrb w12, [x1, #8]
-; CHECK-GI-NEXT:    mov v17.s[2], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[9]
-; CHECK-GI-NEXT:    fmov s19, w8
-; CHECK-GI-NEXT:    umov w8, v1.b[14]
-; CHECK-GI-NEXT:    mul w10, w12, w10
-; CHECK-GI-NEXT:    umov w12, v1.b[7]
+; CHECK-GI-NEXT:    mov v6.s[2], w10
+; CHECK-GI-NEXT:    fmov s21, w0
+; CHECK-GI-NEXT:    mov v17.s[2], w20
 ; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v21.s[1], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #6]
-; CHECK-GI-NEXT:    mov v19.s[1], w13
-; CHECK-GI-NEXT:    ldrb w13, [x0, #1]
-; CHECK-GI-NEXT:    mov v20.s[2], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[10]
-; CHECK-GI-NEXT:    mov v18.s[3], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v21.s[2], w9
-; CHECK-GI-NEXT:    umov w9, v2.b[11]
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    ldrb w10, [x0, #2]
-; CHECK-GI-NEXT:    mov v22.s[1], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v19.s[2], w8
-; CHECK-GI-NEXT:    umov w8, v1.b[3]
-; CHECK-GI-NEXT:    mov v21.s[3], w12
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.s[1], w7
+; CHECK-GI-NEXT:    mov v20.s[1], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[2], w26
+; CHECK-GI-NEXT:    mov v21.s[1], wzr
+; CHECK-GI-NEXT:    mov v16.s[3], w18
+; CHECK-GI-NEXT:    mov v17.s[3], w4
+; CHECK-GI-NEXT:    mov v7.s[2], w13
+; CHECK-GI-NEXT:    mov v5.s[3], w14
+; CHECK-GI-NEXT:    mov v19.s[2], w16
+; CHECK-GI-NEXT:    mov v3.s[2], w1
+; CHECK-GI-NEXT:    mov v0.s[2], wzr
+; CHECK-GI-NEXT:    mov v20.s[2], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[3], w6
+; CHECK-GI-NEXT:    mov v21.s[2], wzr
+; CHECK-GI-NEXT:    mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v20.s[3], w13
-; CHECK-GI-NEXT:    umov w13, v1.b[11]
-; CHECK-GI-NEXT:    mov v22.s[2], w10
-; CHECK-GI-NEXT:    ldrb w10, [x1, #3]
-; CHECK-GI-NEXT:    mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT:    mov v19.s[3], w13
-; CHECK-GI-NEXT:    mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT:    mov v22.s[3], w9
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT:    mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT:    mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w12
+; CHECK-GI-NEXT:    mov v19.s[3], w15
+; CHECK-GI-NEXT:    mov v3.s[3], w17
+; CHECK-GI-NEXT:    mov v20.s[3], w25
+; CHECK-GI-NEXT:    mov v0.s[3], wzr
+; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT:    mov v21.s[3], wzr
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT:    mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT:    mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v4.4s
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a
@@ -2455,73 +2477,77 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
 ;
 ; CHECK-GI-LABEL: test_udot_v25i8_nomla:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrb w17, [x0, #16]!
-; CHECK-GI-NEXT:    ldrb w16, [x0, #4]
-; CHECK-GI-NEXT:    ldrb w14, [x0, #8]
+; CHECK-GI-NEXT:    str x19, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    .cfi_offset w19, -16
+; CHECK-GI-NEXT:    ldp q2, q1, [x0]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    umov w15, v1.b[0]
-; CHECK-GI-NEXT:    umov w2, v1.b[4]
-; CHECK-GI-NEXT:    umov w4, v1.b[8]
-; CHECK-GI-NEXT:    umov w5, v1.b[12]
-; CHECK-GI-NEXT:    umov w1, v1.b[1]
-; CHECK-GI-NEXT:    umov w3, v1.b[5]
-; CHECK-GI-NEXT:    umov w6, v1.b[9]
-; CHECK-GI-NEXT:    umov w7, v1.b[13]
-; CHECK-GI-NEXT:    fmov s6, w17
-; CHECK-GI-NEXT:    fmov s7, w16
-; CHECK-GI-NEXT:    fmov s16, w14
-; CHECK-GI-NEXT:    ldrb w18, [x0, #1]
-; CHECK-GI-NEXT:    fmov s2, w15
-; CHECK-GI-NEXT:    fmov s3, w2
-; CHECK-GI-NEXT:    ldrb w11, [x0, #5]
-; CHECK-GI-NEXT:    fmov s4, w4
-; CHECK-GI-NEXT:    fmov s5, w5
-; CHECK-GI-NEXT:    ldrb w16, [x0, #2]
-; CHECK-GI-NEXT:    umov w9, v1.b[2]
-; CHECK-GI-NEXT:    umov w12, v1.b[6]
-; CHECK-GI-NEXT:    ldrb w17, [x0, #6]
-; CHECK-GI-NEXT:    umov w13, v1.b[10]
-; CHECK-GI-NEXT:    umov w15, v1.b[14]
-; CHECK-GI-NEXT:    mov v2.s[1], w1
-; CHECK-GI-NEXT:    mov v3.s[1], w3
-; CHECK-GI-NEXT:    mov v4.s[1], w6
-; CHECK-GI-NEXT:    mov v5.s[1], w7
-; CHECK-GI-NEXT:    mov v6.s[1], w18
-; CHECK-GI-NEXT:    mov v7.s[1], w11
-; CHECK-GI-NEXT:    mov v16.s[1], wzr
+; CHECK-GI-NEXT:    umov w15, v2.b[0]
+; CHECK-GI-NEXT:    umov w17, v2.b[4]
+; CHECK-GI-NEXT:    umov w0, v2.b[8]
+; CHECK-GI-NEXT:    umov w2, v2.b[12]
+; CHECK-GI-NEXT:    umov w4, v1.b[0]
+; CHECK-GI-NEXT:    umov w6, v1.b[4]
+; CHECK-GI-NEXT:    umov w19, v1.b[8]
+; CHECK-GI-NEXT:    umov w16, v2.b[1]
+; CHECK-GI-NEXT:    umov w18, v2.b[5]
+; CHECK-GI-NEXT:    umov w1, v2.b[9]
+; CHECK-GI-NEXT:    umov w3, v2.b[13]
+; CHECK-GI-NEXT:    umov w5, v1.b[1]
+; CHECK-GI-NEXT:    umov w7, v1.b[5]
+; CHECK-GI-NEXT:    fmov s3, w15
+; CHECK-GI-NEXT:    fmov s4, w17
+; CHECK-GI-NEXT:    fmov s5, w0
+; CHECK-GI-NEXT:    fmov s6, w2
+; CHECK-GI-NEXT:    fmov s7, w4
+; CHECK-GI-NEXT:    fmov s16, w6
+; CHECK-GI-NEXT:    fmov s17, w19
+; CHECK-GI-NEXT:    umov w10, v2.b[2]
+; CHECK-GI-NEXT:    umov w11, v2.b[6]
+; CHECK-GI-NEXT:    umov w12, v2.b[10]
+; CHECK-GI-NEXT:    umov w13, v2.b[14]
+; CHECK-GI-NEXT:    umov w14, v1.b[2]
+; CHECK-GI-NEXT:    umov w15, v1.b[6]
+; CHECK-GI-NEXT:    mov v3.s[1], w16
+; CHECK-GI-NEXT:    mov v4.s[1], w18
+; CHECK-GI-NEXT:    mov v5.s[1], w1
+; CHECK-GI-NEXT:    mov v6.s[1], w3
+; CHECK-GI-NEXT:    mov v7.s[1], w5
+; CHECK-GI-NEXT:    mov v16.s[1], w7
+; CHECK-GI-NEXT:    mov v17.s[1], wzr
 ; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    umov w8, v1.b[3]
-; CHECK-GI-NEXT:    umov w10, v1.b[7]
-; CHECK-GI-NEXT:    umov w11, v1.b[11]
-; CHECK-GI-NEXT:    umov w14, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[2], w9
-; CHECK-GI-NEXT:    ldrb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v3.s[2], w12
-; CHECK-GI-NEXT:    ldrb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v4.s[2], w13
-; CHECK-GI-NEXT:    mov v5.s[2], w15
-; CHECK-GI-NEXT:    mov v6.s[2], w16
-; CHECK-GI-NEXT:    mov v7.s[2], w17
-; CHECK-GI-NEXT:    mov v16.s[2], wzr
+; CHECK-GI-NEXT:    umov w8, v2.b[3]
+; CHECK-GI-NEXT:    umov w9, v2.b[7]
+; CHECK-GI-NEXT:    umov w16, v2.b[11]
+; CHECK-GI-NEXT:    umov w17, v2.b[15]
+; CHECK-GI-NEXT:    umov w18, v1.b[3]
+; CHECK-GI-NEXT:    umov w0, v1.b[7]
+; CHECK-GI-NEXT:    mov v3.s[2], w10
+; CHECK-GI-NEXT:    mov v4.s[2], w11
+; CHECK-GI-NEXT:    mov v5.s[2], w12
+; CHECK-GI-NEXT:    mov v6.s[2], w13
+; CHECK-GI-NEXT:    mov v7.s[2], w14
+; CHECK-GI-NEXT:    mov v16.s[2], w15
+; CHECK-GI-NEXT:    mov v17.s[2], wzr
 ; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v5.s[3], w14
-; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    mov v7.s[3], w12
-; CHECK-GI-NEXT:    mov v16.s[3], wzr
+; CHECK-GI-NEXT:    mov v3.s[3], w8
+; CHECK-GI-NEXT:    mov v4.s[3], w9
+; CHECK-GI-NEXT:    mov v5.s[3], w16
+; CHECK-GI-NEXT:    mov v6.s[3], w17
+; CHECK-GI-NEXT:    mov v7.s[3], w18
+; CHECK-GI-NEXT:    mov v16.s[3], w0
+; CHECK-GI-NEXT:    mov v17.s[3], wzr
 ; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    add v2.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    add v3.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    add v0.4s, v16.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    add v2.4s, v5.4s, v6.4s
+; CHECK-GI-NEXT:    add v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT:    add v0.4s, v17.4s, v0.4s
 ; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ldr x19, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a1
@@ -2554,128 +2580,145 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ;
 ; CHECK-GI-LABEL: test_sdot_v25i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldrsb w9, [x1, #16]!
-; CHECK-GI-NEXT:    ldrsb w11, [x1, #4]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #5]
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset w19, -8
+; CHECK-GI-NEXT:    .cfi_offset w20, -16
+; CHECK-GI-NEXT:    .cfi_offset w21, -24
+; CHECK-GI-NEXT:    .cfi_offset w22, -32
+; CHECK-GI-NEXT:    .cfi_offset w23, -40
+; CHECK-GI-NEXT:    .cfi_offset w24, -48
+; CHECK-GI-NEXT:    .cfi_offset w25, -56
+; CHECK-GI-NEXT:    .cfi_offset w26, -64
+; CHECK-GI-NEXT:    ldp q1, q7, [x1]
 ; CHECK-GI-NEXT:    fmov s0, wzr
-; CHECK-GI-NEXT:    smov w13, v2.b[4]
-; CHECK-GI-NEXT:    smov w14, v2.b[5]
-; CHECK-GI-NEXT:    smov w10, v2.b[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    smov w9, v2.b[8]
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    smov w11, v2.b[12]
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldrsb w8, [x1, #1]
+; CHECK-GI-NEXT:    ldp q16, q3, [x0]
+; CHECK-GI-NEXT:    smov w9, v1.b[4]
+; CHECK-GI-NEXT:    smov w11, v1.b[5]
+; CHECK-GI-NEXT:    smov w18, v1.b[0]
+; CHECK-GI-NEXT:    smov w0, v1.b[12]
+; CHECK-GI-NEXT:    smov w3, v7.b[4]
+; CHECK-GI-NEXT:    smov w12, v1.b[1]
+; CHECK-GI-NEXT:    smov w13, v1.b[6]
+; CHECK-GI-NEXT:    smov w1, v1.b[13]
+; CHECK-GI-NEXT:    smov w4, v7.b[5]
+; CHECK-GI-NEXT:    smov w15, v1.b[2]
+; CHECK-GI-NEXT:    smov w8, v1.b[3]
+; CHECK-GI-NEXT:    smov w16, v1.b[7]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    smov w14, v1.b[8]
+; CHECK-GI-NEXT:    smov w17, v1.b[9]
+; CHECK-GI-NEXT:    smov w10, v1.b[10]
+; CHECK-GI-NEXT:    smov w9, v1.b[11]
+; CHECK-GI-NEXT:    smov w5, v1.b[14]
+; CHECK-GI-NEXT:    smov w6, v7.b[0]
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w3
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    smov w11, v1.b[15]
+; CHECK-GI-NEXT:    fmov s1, w18
+; CHECK-GI-NEXT:    smov w7, v7.b[1]
+; CHECK-GI-NEXT:    smov w18, v7.b[6]
+; CHECK-GI-NEXT:    smov w21, v16.b[4]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w4
+; CHECK-GI-NEXT:    fmov s6, w14
+; CHECK-GI-NEXT:    mov v1.s[1], w12
+; CHECK-GI-NEXT:    smov w12, v7.b[3]
+; CHECK-GI-NEXT:    smov w14, v7.b[7]
+; CHECK-GI-NEXT:    mov v2.s[2], w13
+; CHECK-GI-NEXT:    smov w13, v7.b[2]
+; CHECK-GI-NEXT:    smov w0, v7.b[8]
+; CHECK-GI-NEXT:    fmov s7, w6
+; CHECK-GI-NEXT:    smov w23, v16.b[12]
+; CHECK-GI-NEXT:    smov w25, v3.b[4]
+; CHECK-GI-NEXT:    mov v6.s[1], w17
+; CHECK-GI-NEXT:    mov v4.s[2], w5
+; CHECK-GI-NEXT:    mov v5.s[2], w18
+; CHECK-GI-NEXT:    mov v1.s[2], w15
+; CHECK-GI-NEXT:    smov w6, v16.b[0]
+; CHECK-GI-NEXT:    smov w3, v16.b[1]
+; CHECK-GI-NEXT:    mov v2.s[3], w16
+; CHECK-GI-NEXT:    mov v7.s[1], w7
+; CHECK-GI-NEXT:    smov w16, v16.b[2]
+; CHECK-GI-NEXT:    smov w15, v16.b[3]
+; CHECK-GI-NEXT:    smov w22, v16.b[5]
+; CHECK-GI-NEXT:    smov w5, v16.b[6]
+; CHECK-GI-NEXT:    smov w18, v16.b[7]
+; CHECK-GI-NEXT:    smov w19, v16.b[8]
+; CHECK-GI-NEXT:    smov w7, v16.b[9]
+; CHECK-GI-NEXT:    smov w24, v16.b[13]
+; CHECK-GI-NEXT:    smov w1, v16.b[10]
+; CHECK-GI-NEXT:    smov w17, v16.b[11]
+; CHECK-GI-NEXT:    smov w20, v16.b[14]
+; CHECK-GI-NEXT:    smov w4, v16.b[15]
+; CHECK-GI-NEXT:    fmov s16, w21
+; CHECK-GI-NEXT:    smov w21, v3.b[8]
+; CHECK-GI-NEXT:    smov w26, v3.b[5]
+; CHECK-GI-NEXT:    fmov s17, w23
+; CHECK-GI-NEXT:    smov w23, v3.b[0]
+; CHECK-GI-NEXT:    fmov s18, w25
+; CHECK-GI-NEXT:    smov w25, v3.b[3]
+; CHECK-GI-NEXT:    mov v16.s[1], w22
+; CHECK-GI-NEXT:    smov w22, v3.b[1]
+; CHECK-GI-NEXT:    fmov s19, w6
+; CHECK-GI-NEXT:    mov v17.s[1], w24
+; CHECK-GI-NEXT:    smov w24, v3.b[2]
+; CHECK-GI-NEXT:    smov w6, v3.b[7]
+; CHECK-GI-NEXT:    mul w0, w0, w21
+; CHECK-GI-NEXT:    mov v18.s[1], w26
+; CHECK-GI-NEXT:    smov w26, v3.b[6]
+; CHECK-GI-NEXT:    fmov s3, w19
+; CHECK-GI-NEXT:    fmov s20, w23
+; CHECK-GI-NEXT:    mov v19.s[1], w3
+; CHECK-GI-NEXT:    mov v16.s[2], w5
 ; CHECK-GI-NEXT:    mov v0.s[1], wzr
-; CHECK-GI-NEXT:    fmov s7, w13
-; CHECK-GI-NEXT:    fmov s4, w10
-; CHECK-GI-NEXT:    smov w10, v2.b[13]
-; CHECK-GI-NEXT:    mov v5.s[1], w12
-; CHECK-GI-NEXT:    smov w13, v2.b[9]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    fmov s16, w11
-; CHECK-GI-NEXT:    smov w9, v1.b[0]
-; CHECK-GI-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NEXT:    mov v7.s[1], w14
-; CHECK-GI-NEXT:    smov w14, v2.b[6]
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #6]
-; CHECK-GI-NEXT:    smov w8, v2.b[1]
-; CHECK-GI-NEXT:    smov w11, v2.b[2]
-; CHECK-GI-NEXT:    mov v0.s[2], wzr
-; CHECK-GI-NEXT:    mov v16.s[1], w10
-; CHECK-GI-NEXT:    smov w10, v2.b[14]
-; CHECK-GI-NEXT:    mov v5.s[2], w12
-; CHECK-GI-NEXT:    smov w12, v1.b[5]
-; CHECK-GI-NEXT:    mov v6.s[1], w13
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    mov v7.s[2], w14
-; CHECK-GI-NEXT:    smov w14, v1.b[4]
-; CHECK-GI-NEXT:    smov w9, v2.b[10]
-; CHECK-GI-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[1]
-; CHECK-GI-NEXT:    smov w13, v2.b[7]
-; CHECK-GI-NEXT:    mov v16.s[2], w10
-; CHECK-GI-NEXT:    smov w10, v2.b[15]
-; CHECK-GI-NEXT:    mov v0.s[3], wzr
-; CHECK-GI-NEXT:    fmov s18, w14
-; CHECK-GI-NEXT:    mov v6.s[2], w9
-; CHECK-GI-NEXT:    smov w9, v1.b[12]
-; CHECK-GI-NEXT:    mov v4.s[2], w11
-; CHECK-GI-NEXT:    ldrsb w11, [x1, #7]
-; CHECK-GI-NEXT:    mov v17.s[1], w8
-; CHECK-GI-NEXT:    ldrsb w8, [x1, #2]
-; CHECK-GI-NEXT:    mov v16.s[3], w10
-; CHECK-GI-NEXT:    smov w10, v1.b[13]
-; CHECK-GI-NEXT:    mov v18.s[1], w12
-; CHECK-GI-NEXT:    smov w12, v1.b[6]
-; CHECK-GI-NEXT:    mov v5.s[3], w11
-; CHECK-GI-NEXT:    ldrsb w11, [x0, #16]!
-; CHECK-GI-NEXT:    mov v7.s[3], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[2]
-; CHECK-GI-NEXT:    fmov s20, w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #5]
-; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[8]
-; CHECK-GI-NEXT:    fmov s22, w11
-; CHECK-GI-NEXT:    mov v18.s[2], w12
-; CHECK-GI-NEXT:    ldrsb w12, [x0, #4]
-; CHECK-GI-NEXT:    smov w11, v2.b[3]
-; CHECK-GI-NEXT:    mov v20.s[1], w10
-; CHECK-GI-NEXT:    ldrsb w10, [x0, #8]
-; CHECK-GI-NEXT:    fmov s21, w12
-; CHECK-GI-NEXT:    ldrsb w12, [x1, #8]
-; CHECK-GI-NEXT:    mov v17.s[2], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[9]
-; CHECK-GI-NEXT:    fmov s19, w8
-; CHECK-GI-NEXT:    smov w8, v1.b[14]
-; CHECK-GI-NEXT:    mul w10, w12, w10
-; CHECK-GI-NEXT:    smov w12, v1.b[7]
+; CHECK-GI-NEXT:    mov v6.s[2], w10
+; CHECK-GI-NEXT:    fmov s21, w0
+; CHECK-GI-NEXT:    mov v17.s[2], w20
 ; CHECK-GI-NEXT:    mov v4.s[3], w11
-; CHECK-GI-NEXT:    mov v21.s[1], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #6]
-; CHECK-GI-NEXT:    mov v19.s[1], w13
-; CHECK-GI-NEXT:    ldrsb w13, [x0, #1]
-; CHECK-GI-NEXT:    mov v20.s[2], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[10]
-; CHECK-GI-NEXT:    mov v18.s[3], w12
-; CHECK-GI-NEXT:    ldrsb w12, [x0, #7]
-; CHECK-GI-NEXT:    mov v21.s[2], w9
-; CHECK-GI-NEXT:    smov w9, v2.b[11]
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    ldrsb w10, [x0, #2]
-; CHECK-GI-NEXT:    mov v22.s[1], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[15]
-; CHECK-GI-NEXT:    mov v2.s[1], wzr
-; CHECK-GI-NEXT:    mov v19.s[2], w8
-; CHECK-GI-NEXT:    smov w8, v1.b[3]
-; CHECK-GI-NEXT:    mov v21.s[3], w12
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v3.s[1], w7
+; CHECK-GI-NEXT:    mov v20.s[1], w22
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[2], w26
+; CHECK-GI-NEXT:    mov v21.s[1], wzr
+; CHECK-GI-NEXT:    mov v16.s[3], w18
+; CHECK-GI-NEXT:    mov v17.s[3], w4
+; CHECK-GI-NEXT:    mov v7.s[2], w13
+; CHECK-GI-NEXT:    mov v5.s[3], w14
+; CHECK-GI-NEXT:    mov v19.s[2], w16
+; CHECK-GI-NEXT:    mov v3.s[2], w1
+; CHECK-GI-NEXT:    mov v0.s[2], wzr
+; CHECK-GI-NEXT:    mov v20.s[2], w24
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v18.s[3], w6
+; CHECK-GI-NEXT:    mov v21.s[2], wzr
+; CHECK-GI-NEXT:    mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v6.s[3], w9
-; CHECK-GI-NEXT:    ldrsb w9, [x0, #3]
-; CHECK-GI-NEXT:    mov v20.s[3], w13
-; CHECK-GI-NEXT:    smov w13, v1.b[11]
-; CHECK-GI-NEXT:    mov v22.s[2], w10
-; CHECK-GI-NEXT:    ldrsb w10, [x1, #3]
-; CHECK-GI-NEXT:    mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT:    mov v2.s[2], wzr
-; CHECK-GI-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w10
-; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT:    mov v19.s[3], w13
-; CHECK-GI-NEXT:    mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT:    mov v22.s[3], w9
-; CHECK-GI-NEXT:    mov v2.s[3], wzr
-; CHECK-GI-NEXT:    mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT:    mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT:    mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT:    mov v7.s[3], w12
+; CHECK-GI-NEXT:    mov v19.s[3], w15
+; CHECK-GI-NEXT:    mov v3.s[3], w17
+; CHECK-GI-NEXT:    mov v20.s[3], w25
+; CHECK-GI-NEXT:    mov v0.s[3], wzr
+; CHECK-GI-NEXT:    mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT:    mov v21.s[3], wzr
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT:    mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT:    mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT:    add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v4.4s
 ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <25 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 7a436eddb23a6..5e278d59b6591 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x1]
-; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
-; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], w10
-; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v1.b[1]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v1.b[2]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s5
 ; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    mov v1.h[2], w9
 ; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    add x9, x1, #2
-; CHECK-GI-NEXT:    add x10, x1, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
 ; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]



More information about the llvm-commits mailing list