[llvm] [GlobalISel] Widen vector loads from aligned ptrs (PR #144309)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 20 23:38:57 PDT 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/144309
>From 38e58052c5671948baecf7715ce6ad15f49e4ddc Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sat, 21 Jun 2025 07:38:41 +0100
Subject: [PATCH] [GlobalISel] Widen vector loads from aligned ptrs
If the pointer is aligned to more than the size of the vector, we can widen the
load up to the next power-of-2 size, as SelectionDAG (SDAG) already does.
Some of the v3 tests currently produce worse code - those should be addressed
in other issues.
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 15 +
.../GlobalISel/legalize-load-range.mir | 46 ++
llvm/test/CodeGen/AArch64/add.ll | 30 +-
llvm/test/CodeGen/AArch64/andorxor.ll | 90 +--
llvm/test/CodeGen/AArch64/ctlz.ll | 18 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 18 +-
llvm/test/CodeGen/AArch64/cttz.ll | 38 +-
llvm/test/CodeGen/AArch64/load.ll | 98 +--
llvm/test/CodeGen/AArch64/mul.ll | 30 +-
llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 739 +++++++++---------
llvm/test/CodeGen/AArch64/sub.ll | 30 +-
11 files changed, 598 insertions(+), 554 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 028bffd1bf5a7..a28361051b418 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4072,6 +4072,21 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
if (MemTy != DstTy)
return UnableToLegalize;
+ Align Alignment = LoadMI.getAlign();
+ // Given an alignment larger than the size of the memory, we can increase
+ // the size of the load without needing to scalarize it.
+ if (Alignment.value() * 8 > MemSizeInBits &&
+ isPowerOf2_64(DstTy.getScalarSizeInBits())) {
+ LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
+ DstTy.getElementType());
+ MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
+ auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
+ MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
+ NewLoad.getReg(0));
+ LoadMI.eraseFromParent();
+ return Legalized;
+ }
+
// TODO: We can do better than scalarizing the vector and at least split it
// in half.
return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
new file mode 100644
index 0000000000000..5611642a13649
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel -o - %s | FileCheck %s
+
+--- |
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+ target triple = "aarch64"
+
+ define <3 x i16> @range_v3i16(ptr %a_ptr, ptr %b_ptr) {
+ %a = load <3 x i16>, ptr %a_ptr, align 8, !range !0, !noundef !1
+ %b = load <3 x i16>, ptr %b_ptr, align 8, !range !2, !noundef !1
+ %result = add <3 x i16> %a, %b
+ ret <3 x i16> %result
+ }
+
+ !0 = !{i16 16, i16 17}
+ !1 = !{}
+ !2 = !{i16 32, i16 33}
+...
+---
+name: range_v3i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x0, $x1
+ ; Make sure we drop the range metadata when widening an aligned load.
+
+ ; CHECK-LABEL: name: range_v3i16
+ ; CHECK: liveins: $x0, $x1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>) from %ir.a_ptr)
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY1]](p0) :: (load (<4 x s16>) from %ir.b_ptr)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s16>) = G_ADD [[LOAD]], [[LOAD1]]
+ ; CHECK-NEXT: $d0 = COPY [[ADD]](<4 x s16>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $d0
+ %0:_(p0) = COPY $x0
+ %1:_(p0) = COPY $x1
+ %2:_(<3 x s16>) = G_LOAD %0(p0) :: (load (<3 x s16>) from %ir.a_ptr, align 8, !range !0)
+ %3:_(<3 x s16>) = G_LOAD %1(p0) :: (load (<3 x s16>) from %ir.b_ptr, align 8, !range !2)
+ %4:_(<3 x s16>) = G_ADD %2, %3
+ %5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>)
+ %8:_(s16) = G_IMPLICIT_DEF
+ %9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16)
+ $d0 = COPY %9(<4 x s16>)
+ RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index d5bd1b712a2a6..96168cb80196f 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index f7df1092287bd..a7875dbebd0e6 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: and_v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
@@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: or_v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
@@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: xor_v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
@@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: and_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
@@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: or_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
@@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: xor_v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index b1c6e24c30a7d..04124609eec74 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr b0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: add x9, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT: clz v0.8b, v0.8b
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT: clz v0.8b, v2.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: clz v0.4h, v0.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 55f75b6bc3f27..c739be95cd243 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr b0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: add x9, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
-; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: add x8, x0, #1
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT: mov b0, v0.b[2]
+; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
+; CHECK-GI-NEXT: cnt v0.8b, v2.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
@@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
; CHECK-GI-NEXT: str h0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 93ac97e20dabd..fc9bf2c0aca65 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w9, [x0]
+; CHECK-GI-NEXT: ldr w9, [x0]
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w9, [x0, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v1.h[1], w10
-; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: fmov s0, w9
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov v2.h[2], w8
; CHECK-GI-NEXT: add x8, x0, #1
-; CHECK-GI-NEXT: mov v1.h[2], w9
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: mov b1, v0.b[2]
+; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: add x9, x0, #2
-; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT: eor v1.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
@@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) {
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: add x9, x0, #2
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: add x10, x0, #4
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
; CHECK-GI-NEXT: str h0, [x0]
-; CHECK-GI-NEXT: st1 { v0.h }[1], [x9]
-; CHECK-GI-NEXT: st1 { v0.h }[2], [x10]
+; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ret
entry:
%d = load <3 x i16>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 6b26ae98a4ed8..c4bb6e37d6eaf 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -335,102 +335,50 @@ define <3 x i8> @load_v3i8(ptr %ptr) {
;
; CHECK-GI-LABEL: load_v3i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w1, [x0, #1]
-; CHECK-GI-NEXT: ldrb w2, [x0, #2]
-; CHECK-GI-NEXT: mov w0, w8
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b2, v0.b[2]
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: fmov w1, s1
+; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%a = load <3 x i8>, ptr %ptr
ret <3 x i8> %a
}
define <7 x i8> @load_v7i8(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i8:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: load_v7i8:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr b0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #1
-; CHECK-GI-NEXT: mov v0.b[0], v0.b[0]
-; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.b }[2], [x8]
-; CHECK-GI-NEXT: add x8, x0, #3
-; CHECK-GI-NEXT: ld1 { v0.b }[3], [x8]
-; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-GI-NEXT: add x8, x0, #5
-; CHECK-GI-NEXT: ld1 { v0.b }[5], [x8]
-; CHECK-GI-NEXT: add x8, x0, #6
-; CHECK-GI-NEXT: ld1 { v0.b }[6], [x8]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: load_v7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
%a = load <7 x i8>, ptr %ptr
ret <7 x i8> %a
}
define <3 x i16> @load_v3i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr d0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: load_v3i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: load_v3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
%a = load <3 x i16>, ptr %ptr
ret <3 x i16> %a
}
define <7 x i16> @load_v7i16(ptr %ptr) {
-; CHECK-SD-LABEL: load_v7i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: load_v7i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-GI-NEXT: add x8, x0, #6
-; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8]
-; CHECK-GI-NEXT: add x8, x0, #8
-; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8]
-; CHECK-GI-NEXT: add x8, x0, #10
-; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8]
-; CHECK-GI-NEXT: add x8, x0, #12
-; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: load_v7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
%a = load <7 x i16>, ptr %ptr
ret <7 x i16> %a
}
define <3 x i32> @load_v3i32(ptr %ptr) {
-; CHECK-SD-LABEL: load_v3i32:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr q0, [x0]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: load_v3i32:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: add x8, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8]
-; CHECK-GI-NEXT: add x8, x0, #8
-; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: load_v3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
%a = load <3 x i32>, ptr %ptr
ret <3 x i32> %a
}
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 1558043f7f40a..9c69a6f03b858 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -122,16 +122,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h
@@ -282,16 +286,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index a534112b7c559..4f0c4080aa0ce 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -412,31 +412,33 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
;
; CHECK-GI-LABEL: test_udot_v5i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0, #4]
-; CHECK-GI-NEXT: ldrb w9, [x1, #4]
-; CHECK-GI-NEXT: ldrb w10, [x1]
-; CHECK-GI-NEXT: ldrb w11, [x0, #1]
-; CHECK-GI-NEXT: ldrb w12, [x1, #1]
-; CHECK-GI-NEXT: mul w8, w9, w8
-; CHECK-GI-NEXT: ldrb w9, [x0]
-; CHECK-GI-NEXT: fmov s0, w10
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: mov v0.s[1], w12
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], wzr
-; CHECK-GI-NEXT: mov v0.s[2], w9
-; CHECK-GI-NEXT: ldrb w9, [x1, #3]
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: ldrb w8, [x0, #3]
-; CHECK-GI-NEXT: mov v2.s[2], wzr
-; CHECK-GI-NEXT: mov v0.s[3], w9
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mov v2.s[3], wzr
-; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: umov w8, v1.b[4]
+; CHECK-GI-NEXT: umov w9, v0.b[4]
+; CHECK-GI-NEXT: umov w10, v1.b[0]
+; CHECK-GI-NEXT: umov w12, v0.b[0]
+; CHECK-GI-NEXT: umov w11, v1.b[1]
+; CHECK-GI-NEXT: umov w13, v0.b[1]
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: fmov s2, w10
+; CHECK-GI-NEXT: umov w9, v1.b[2]
+; CHECK-GI-NEXT: fmov s3, w12
+; CHECK-GI-NEXT: umov w10, v1.b[3]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v3.s[1], w13
+; CHECK-GI-NEXT: umov w11, v0.b[3]
+; CHECK-GI-NEXT: mov v4.s[1], wzr
+; CHECK-GI-NEXT: mov v2.s[2], w9
+; CHECK-GI-NEXT: mov v3.s[2], w8
+; CHECK-GI-NEXT: mov v4.s[2], wzr
+; CHECK-GI-NEXT: mov v2.s[3], w10
+; CHECK-GI-NEXT: mov v3.s[3], w11
+; CHECK-GI-NEXT: mov v4.s[3], wzr
+; CHECK-GI-NEXT: mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v4.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
@@ -466,20 +468,21 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
;
; CHECK-GI-LABEL: test_udot_v5i8_nomla:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x0, #4]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: mov v0.s[1], w10
-; CHECK-GI-NEXT: mov v1.s[1], wzr
-; CHECK-GI-NEXT: mov v0.s[2], w8
-; CHECK-GI-NEXT: mov v1.s[2], wzr
-; CHECK-GI-NEXT: ldrb w8, [x0, #3]
-; CHECK-GI-NEXT: mov v0.s[3], w8
-; CHECK-GI-NEXT: mov v1.s[3], wzr
-; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: umov w8, v0.b[0]
+; CHECK-GI-NEXT: umov w9, v0.b[4]
+; CHECK-GI-NEXT: umov w10, v0.b[1]
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: umov w8, v0.b[2]
+; CHECK-GI-NEXT: umov w9, v0.b[3]
+; CHECK-GI-NEXT: mov v1.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], wzr
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: mov v2.s[2], wzr
+; CHECK-GI-NEXT: mov v1.s[3], w9
+; CHECK-GI-NEXT: mov v2.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v2.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
@@ -506,31 +509,33 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
;
; CHECK-GI-LABEL: test_sdot_v5i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrsb w8, [x0, #4]
-; CHECK-GI-NEXT: ldrsb w9, [x1, #4]
-; CHECK-GI-NEXT: ldrsb w10, [x1]
-; CHECK-GI-NEXT: ldrsb w11, [x0, #1]
-; CHECK-GI-NEXT: ldrsb w12, [x1, #1]
-; CHECK-GI-NEXT: mul w8, w9, w8
-; CHECK-GI-NEXT: ldrsb w9, [x0]
-; CHECK-GI-NEXT: fmov s0, w10
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrsb w9, [x1, #2]
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: mov v0.s[1], w12
-; CHECK-GI-NEXT: ldrsb w8, [x0, #2]
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], wzr
-; CHECK-GI-NEXT: mov v0.s[2], w9
-; CHECK-GI-NEXT: ldrsb w9, [x1, #3]
-; CHECK-GI-NEXT: mov v1.s[2], w8
-; CHECK-GI-NEXT: ldrsb w8, [x0, #3]
-; CHECK-GI-NEXT: mov v2.s[2], wzr
-; CHECK-GI-NEXT: mov v0.s[3], w9
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mov v2.s[3], wzr
-; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: smov w8, v1.b[4]
+; CHECK-GI-NEXT: smov w9, v0.b[4]
+; CHECK-GI-NEXT: smov w10, v1.b[0]
+; CHECK-GI-NEXT: smov w12, v0.b[0]
+; CHECK-GI-NEXT: smov w11, v1.b[1]
+; CHECK-GI-NEXT: smov w13, v0.b[1]
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: fmov s2, w10
+; CHECK-GI-NEXT: smov w9, v1.b[2]
+; CHECK-GI-NEXT: fmov s3, w12
+; CHECK-GI-NEXT: smov w10, v1.b[3]
+; CHECK-GI-NEXT: fmov s4, w8
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: smov w8, v0.b[2]
+; CHECK-GI-NEXT: mov v3.s[1], w13
+; CHECK-GI-NEXT: smov w11, v0.b[3]
+; CHECK-GI-NEXT: mov v4.s[1], wzr
+; CHECK-GI-NEXT: mov v2.s[2], w9
+; CHECK-GI-NEXT: mov v3.s[2], w8
+; CHECK-GI-NEXT: mov v4.s[2], wzr
+; CHECK-GI-NEXT: mov v2.s[3], w10
+; CHECK-GI-NEXT: mov v3.s[3], w11
+; CHECK-GI-NEXT: mov v4.s[3], wzr
+; CHECK-GI-NEXT: mla v4.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v4.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
@@ -2298,128 +2303,145 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_udot_v25i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr q2, [x1]
-; CHECK-GI-NEXT: ldrb w9, [x1, #16]!
-; CHECK-GI-NEXT: ldrb w11, [x1, #4]
-; CHECK-GI-NEXT: ldrb w12, [x1, #5]
+; CHECK-GI-NEXT: stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w21, -24
+; CHECK-GI-NEXT: .cfi_offset w22, -32
+; CHECK-GI-NEXT: .cfi_offset w23, -40
+; CHECK-GI-NEXT: .cfi_offset w24, -48
+; CHECK-GI-NEXT: .cfi_offset w25, -56
+; CHECK-GI-NEXT: .cfi_offset w26, -64
+; CHECK-GI-NEXT: ldp q1, q7, [x1]
; CHECK-GI-NEXT: fmov s0, wzr
-; CHECK-GI-NEXT: umov w13, v2.b[4]
-; CHECK-GI-NEXT: umov w14, v2.b[5]
-; CHECK-GI-NEXT: umov w10, v2.b[0]
-; CHECK-GI-NEXT: fmov s3, w9
-; CHECK-GI-NEXT: umov w9, v2.b[8]
-; CHECK-GI-NEXT: fmov s5, w11
-; CHECK-GI-NEXT: umov w11, v2.b[12]
-; CHECK-GI-NEXT: ldr q1, [x0]
-; CHECK-GI-NEXT: ldrb w8, [x1, #1]
+; CHECK-GI-NEXT: ldp q16, q3, [x0]
+; CHECK-GI-NEXT: umov w9, v1.b[4]
+; CHECK-GI-NEXT: umov w11, v1.b[5]
+; CHECK-GI-NEXT: umov w18, v1.b[0]
+; CHECK-GI-NEXT: umov w0, v1.b[12]
+; CHECK-GI-NEXT: umov w3, v7.b[4]
+; CHECK-GI-NEXT: umov w12, v1.b[1]
+; CHECK-GI-NEXT: umov w13, v1.b[6]
+; CHECK-GI-NEXT: umov w1, v1.b[13]
+; CHECK-GI-NEXT: umov w4, v7.b[5]
+; CHECK-GI-NEXT: umov w15, v1.b[2]
+; CHECK-GI-NEXT: umov w8, v1.b[3]
+; CHECK-GI-NEXT: umov w16, v1.b[7]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: umov w14, v1.b[8]
+; CHECK-GI-NEXT: umov w17, v1.b[9]
+; CHECK-GI-NEXT: umov w10, v1.b[10]
+; CHECK-GI-NEXT: umov w9, v1.b[11]
+; CHECK-GI-NEXT: umov w5, v1.b[14]
+; CHECK-GI-NEXT: umov w6, v7.b[0]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w3
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: umov w11, v1.b[15]
+; CHECK-GI-NEXT: fmov s1, w18
+; CHECK-GI-NEXT: umov w7, v7.b[1]
+; CHECK-GI-NEXT: umov w18, v7.b[6]
+; CHECK-GI-NEXT: umov w21, v16.b[4]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w4
+; CHECK-GI-NEXT: fmov s6, w14
+; CHECK-GI-NEXT: mov v1.s[1], w12
+; CHECK-GI-NEXT: umov w12, v7.b[3]
+; CHECK-GI-NEXT: umov w14, v7.b[7]
+; CHECK-GI-NEXT: mov v2.s[2], w13
+; CHECK-GI-NEXT: umov w13, v7.b[2]
+; CHECK-GI-NEXT: umov w0, v7.b[8]
+; CHECK-GI-NEXT: fmov s7, w6
+; CHECK-GI-NEXT: umov w23, v16.b[12]
+; CHECK-GI-NEXT: umov w25, v3.b[4]
+; CHECK-GI-NEXT: mov v6.s[1], w17
+; CHECK-GI-NEXT: mov v4.s[2], w5
+; CHECK-GI-NEXT: mov v5.s[2], w18
+; CHECK-GI-NEXT: mov v1.s[2], w15
+; CHECK-GI-NEXT: umov w6, v16.b[0]
+; CHECK-GI-NEXT: umov w3, v16.b[1]
+; CHECK-GI-NEXT: mov v2.s[3], w16
+; CHECK-GI-NEXT: mov v7.s[1], w7
+; CHECK-GI-NEXT: umov w16, v16.b[2]
+; CHECK-GI-NEXT: umov w15, v16.b[3]
+; CHECK-GI-NEXT: umov w22, v16.b[5]
+; CHECK-GI-NEXT: umov w5, v16.b[6]
+; CHECK-GI-NEXT: umov w18, v16.b[7]
+; CHECK-GI-NEXT: umov w19, v16.b[8]
+; CHECK-GI-NEXT: umov w7, v16.b[9]
+; CHECK-GI-NEXT: umov w24, v16.b[13]
+; CHECK-GI-NEXT: umov w1, v16.b[10]
+; CHECK-GI-NEXT: umov w17, v16.b[11]
+; CHECK-GI-NEXT: umov w20, v16.b[14]
+; CHECK-GI-NEXT: umov w4, v16.b[15]
+; CHECK-GI-NEXT: fmov s16, w21
+; CHECK-GI-NEXT: umov w21, v3.b[8]
+; CHECK-GI-NEXT: umov w26, v3.b[5]
+; CHECK-GI-NEXT: fmov s17, w23
+; CHECK-GI-NEXT: umov w23, v3.b[0]
+; CHECK-GI-NEXT: fmov s18, w25
+; CHECK-GI-NEXT: umov w25, v3.b[3]
+; CHECK-GI-NEXT: mov v16.s[1], w22
+; CHECK-GI-NEXT: umov w22, v3.b[1]
+; CHECK-GI-NEXT: fmov s19, w6
+; CHECK-GI-NEXT: mov v17.s[1], w24
+; CHECK-GI-NEXT: umov w24, v3.b[2]
+; CHECK-GI-NEXT: umov w6, v3.b[7]
+; CHECK-GI-NEXT: mul w0, w0, w21
+; CHECK-GI-NEXT: mov v18.s[1], w26
+; CHECK-GI-NEXT: umov w26, v3.b[6]
+; CHECK-GI-NEXT: fmov s3, w19
+; CHECK-GI-NEXT: fmov s20, w23
+; CHECK-GI-NEXT: mov v19.s[1], w3
+; CHECK-GI-NEXT: mov v16.s[2], w5
; CHECK-GI-NEXT: mov v0.s[1], wzr
-; CHECK-GI-NEXT: fmov s7, w13
-; CHECK-GI-NEXT: fmov s4, w10
-; CHECK-GI-NEXT: umov w10, v2.b[13]
-; CHECK-GI-NEXT: mov v5.s[1], w12
-; CHECK-GI-NEXT: umov w13, v2.b[9]
-; CHECK-GI-NEXT: fmov s6, w9
-; CHECK-GI-NEXT: fmov s16, w11
-; CHECK-GI-NEXT: umov w9, v1.b[0]
-; CHECK-GI-NEXT: mov v3.s[1], w8
-; CHECK-GI-NEXT: mov v7.s[1], w14
-; CHECK-GI-NEXT: umov w14, v2.b[6]
-; CHECK-GI-NEXT: ldrb w12, [x1, #6]
-; CHECK-GI-NEXT: umov w8, v2.b[1]
-; CHECK-GI-NEXT: umov w11, v2.b[2]
-; CHECK-GI-NEXT: mov v0.s[2], wzr
-; CHECK-GI-NEXT: mov v16.s[1], w10
-; CHECK-GI-NEXT: umov w10, v2.b[14]
-; CHECK-GI-NEXT: mov v5.s[2], w12
-; CHECK-GI-NEXT: umov w12, v1.b[5]
-; CHECK-GI-NEXT: mov v6.s[1], w13
-; CHECK-GI-NEXT: fmov s17, w9
-; CHECK-GI-NEXT: mov v7.s[2], w14
-; CHECK-GI-NEXT: umov w14, v1.b[4]
-; CHECK-GI-NEXT: umov w9, v2.b[10]
-; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: umov w8, v1.b[1]
-; CHECK-GI-NEXT: umov w13, v2.b[7]
-; CHECK-GI-NEXT: mov v16.s[2], w10
-; CHECK-GI-NEXT: umov w10, v2.b[15]
-; CHECK-GI-NEXT: mov v0.s[3], wzr
-; CHECK-GI-NEXT: fmov s18, w14
-; CHECK-GI-NEXT: mov v6.s[2], w9
-; CHECK-GI-NEXT: umov w9, v1.b[12]
-; CHECK-GI-NEXT: mov v4.s[2], w11
-; CHECK-GI-NEXT: ldrb w11, [x1, #7]
-; CHECK-GI-NEXT: mov v17.s[1], w8
-; CHECK-GI-NEXT: ldrb w8, [x1, #2]
-; CHECK-GI-NEXT: mov v16.s[3], w10
-; CHECK-GI-NEXT: umov w10, v1.b[13]
-; CHECK-GI-NEXT: mov v18.s[1], w12
-; CHECK-GI-NEXT: umov w12, v1.b[6]
-; CHECK-GI-NEXT: mov v5.s[3], w11
-; CHECK-GI-NEXT: ldrb w11, [x0, #16]!
-; CHECK-GI-NEXT: mov v7.s[3], w13
-; CHECK-GI-NEXT: umov w13, v1.b[2]
-; CHECK-GI-NEXT: fmov s20, w9
-; CHECK-GI-NEXT: ldrb w9, [x0, #5]
-; CHECK-GI-NEXT: mov v3.s[2], w8
-; CHECK-GI-NEXT: umov w8, v1.b[8]
-; CHECK-GI-NEXT: fmov s22, w11
-; CHECK-GI-NEXT: mov v18.s[2], w12
-; CHECK-GI-NEXT: ldrb w12, [x0, #4]
-; CHECK-GI-NEXT: umov w11, v2.b[3]
-; CHECK-GI-NEXT: mov v20.s[1], w10
-; CHECK-GI-NEXT: ldrb w10, [x0, #8]
-; CHECK-GI-NEXT: fmov s21, w12
-; CHECK-GI-NEXT: ldrb w12, [x1, #8]
-; CHECK-GI-NEXT: mov v17.s[2], w13
-; CHECK-GI-NEXT: umov w13, v1.b[9]
-; CHECK-GI-NEXT: fmov s19, w8
-; CHECK-GI-NEXT: umov w8, v1.b[14]
-; CHECK-GI-NEXT: mul w10, w12, w10
-; CHECK-GI-NEXT: umov w12, v1.b[7]
+; CHECK-GI-NEXT: mov v6.s[2], w10
+; CHECK-GI-NEXT: fmov s21, w0
+; CHECK-GI-NEXT: mov v17.s[2], w20
; CHECK-GI-NEXT: mov v4.s[3], w11
-; CHECK-GI-NEXT: mov v21.s[1], w9
-; CHECK-GI-NEXT: ldrb w9, [x0, #6]
-; CHECK-GI-NEXT: mov v19.s[1], w13
-; CHECK-GI-NEXT: ldrb w13, [x0, #1]
-; CHECK-GI-NEXT: mov v20.s[2], w8
-; CHECK-GI-NEXT: umov w8, v1.b[10]
-; CHECK-GI-NEXT: mov v18.s[3], w12
-; CHECK-GI-NEXT: ldrb w12, [x0, #7]
-; CHECK-GI-NEXT: mov v21.s[2], w9
-; CHECK-GI-NEXT: umov w9, v2.b[11]
-; CHECK-GI-NEXT: fmov s2, w10
-; CHECK-GI-NEXT: ldrb w10, [x0, #2]
-; CHECK-GI-NEXT: mov v22.s[1], w13
-; CHECK-GI-NEXT: umov w13, v1.b[15]
-; CHECK-GI-NEXT: mov v2.s[1], wzr
-; CHECK-GI-NEXT: mov v19.s[2], w8
-; CHECK-GI-NEXT: umov w8, v1.b[3]
-; CHECK-GI-NEXT: mov v21.s[3], w12
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.s[1], w7
+; CHECK-GI-NEXT: mov v20.s[1], w22
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v18.s[2], w26
+; CHECK-GI-NEXT: mov v21.s[1], wzr
+; CHECK-GI-NEXT: mov v16.s[3], w18
+; CHECK-GI-NEXT: mov v17.s[3], w4
+; CHECK-GI-NEXT: mov v7.s[2], w13
+; CHECK-GI-NEXT: mov v5.s[3], w14
+; CHECK-GI-NEXT: mov v19.s[2], w16
+; CHECK-GI-NEXT: mov v3.s[2], w1
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: mov v20.s[2], w24
+; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v18.s[3], w6
+; CHECK-GI-NEXT: mov v21.s[2], wzr
+; CHECK-GI-NEXT: mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT: mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT: mov v1.s[3], w8
; CHECK-GI-NEXT: mov v6.s[3], w9
-; CHECK-GI-NEXT: ldrb w9, [x0, #3]
-; CHECK-GI-NEXT: mov v20.s[3], w13
-; CHECK-GI-NEXT: umov w13, v1.b[11]
-; CHECK-GI-NEXT: mov v22.s[2], w10
-; CHECK-GI-NEXT: ldrb w10, [x1, #3]
-; CHECK-GI-NEXT: mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT: mov v2.s[2], wzr
-; CHECK-GI-NEXT: mov v17.s[3], w8
-; CHECK-GI-NEXT: mov v3.s[3], w10
-; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT: mov v19.s[3], w13
-; CHECK-GI-NEXT: mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT: mov v22.s[3], w9
-; CHECK-GI-NEXT: mov v2.s[3], wzr
-; CHECK-GI-NEXT: mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT: mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT: mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT: add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: mov v19.s[3], w15
+; CHECK-GI-NEXT: mov v3.s[3], w17
+; CHECK-GI-NEXT: mov v20.s[3], w25
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT: mov v21.s[3], wzr
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT: mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT: mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT: add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s
; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
%0 = load <25 x i8>, ptr %a
@@ -2455,73 +2477,77 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
;
; CHECK-GI-LABEL: test_udot_v25i8_nomla:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr q1, [x0]
-; CHECK-GI-NEXT: ldrb w17, [x0, #16]!
-; CHECK-GI-NEXT: ldrb w16, [x0, #4]
-; CHECK-GI-NEXT: ldrb w14, [x0, #8]
+; CHECK-GI-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w19, -16
+; CHECK-GI-NEXT: ldp q2, q1, [x0]
; CHECK-GI-NEXT: fmov s0, wzr
-; CHECK-GI-NEXT: umov w15, v1.b[0]
-; CHECK-GI-NEXT: umov w2, v1.b[4]
-; CHECK-GI-NEXT: umov w4, v1.b[8]
-; CHECK-GI-NEXT: umov w5, v1.b[12]
-; CHECK-GI-NEXT: umov w1, v1.b[1]
-; CHECK-GI-NEXT: umov w3, v1.b[5]
-; CHECK-GI-NEXT: umov w6, v1.b[9]
-; CHECK-GI-NEXT: umov w7, v1.b[13]
-; CHECK-GI-NEXT: fmov s6, w17
-; CHECK-GI-NEXT: fmov s7, w16
-; CHECK-GI-NEXT: fmov s16, w14
-; CHECK-GI-NEXT: ldrb w18, [x0, #1]
-; CHECK-GI-NEXT: fmov s2, w15
-; CHECK-GI-NEXT: fmov s3, w2
-; CHECK-GI-NEXT: ldrb w11, [x0, #5]
-; CHECK-GI-NEXT: fmov s4, w4
-; CHECK-GI-NEXT: fmov s5, w5
-; CHECK-GI-NEXT: ldrb w16, [x0, #2]
-; CHECK-GI-NEXT: umov w9, v1.b[2]
-; CHECK-GI-NEXT: umov w12, v1.b[6]
-; CHECK-GI-NEXT: ldrb w17, [x0, #6]
-; CHECK-GI-NEXT: umov w13, v1.b[10]
-; CHECK-GI-NEXT: umov w15, v1.b[14]
-; CHECK-GI-NEXT: mov v2.s[1], w1
-; CHECK-GI-NEXT: mov v3.s[1], w3
-; CHECK-GI-NEXT: mov v4.s[1], w6
-; CHECK-GI-NEXT: mov v5.s[1], w7
-; CHECK-GI-NEXT: mov v6.s[1], w18
-; CHECK-GI-NEXT: mov v7.s[1], w11
-; CHECK-GI-NEXT: mov v16.s[1], wzr
+; CHECK-GI-NEXT: umov w15, v2.b[0]
+; CHECK-GI-NEXT: umov w17, v2.b[4]
+; CHECK-GI-NEXT: umov w0, v2.b[8]
+; CHECK-GI-NEXT: umov w2, v2.b[12]
+; CHECK-GI-NEXT: umov w4, v1.b[0]
+; CHECK-GI-NEXT: umov w6, v1.b[4]
+; CHECK-GI-NEXT: umov w19, v1.b[8]
+; CHECK-GI-NEXT: umov w16, v2.b[1]
+; CHECK-GI-NEXT: umov w18, v2.b[5]
+; CHECK-GI-NEXT: umov w1, v2.b[9]
+; CHECK-GI-NEXT: umov w3, v2.b[13]
+; CHECK-GI-NEXT: umov w5, v1.b[1]
+; CHECK-GI-NEXT: umov w7, v1.b[5]
+; CHECK-GI-NEXT: fmov s3, w15
+; CHECK-GI-NEXT: fmov s4, w17
+; CHECK-GI-NEXT: fmov s5, w0
+; CHECK-GI-NEXT: fmov s6, w2
+; CHECK-GI-NEXT: fmov s7, w4
+; CHECK-GI-NEXT: fmov s16, w6
+; CHECK-GI-NEXT: fmov s17, w19
+; CHECK-GI-NEXT: umov w10, v2.b[2]
+; CHECK-GI-NEXT: umov w11, v2.b[6]
+; CHECK-GI-NEXT: umov w12, v2.b[10]
+; CHECK-GI-NEXT: umov w13, v2.b[14]
+; CHECK-GI-NEXT: umov w14, v1.b[2]
+; CHECK-GI-NEXT: umov w15, v1.b[6]
+; CHECK-GI-NEXT: mov v3.s[1], w16
+; CHECK-GI-NEXT: mov v4.s[1], w18
+; CHECK-GI-NEXT: mov v5.s[1], w1
+; CHECK-GI-NEXT: mov v6.s[1], w3
+; CHECK-GI-NEXT: mov v7.s[1], w5
+; CHECK-GI-NEXT: mov v16.s[1], w7
+; CHECK-GI-NEXT: mov v17.s[1], wzr
; CHECK-GI-NEXT: mov v0.s[1], wzr
-; CHECK-GI-NEXT: umov w8, v1.b[3]
-; CHECK-GI-NEXT: umov w10, v1.b[7]
-; CHECK-GI-NEXT: umov w11, v1.b[11]
-; CHECK-GI-NEXT: umov w14, v1.b[15]
-; CHECK-GI-NEXT: mov v2.s[2], w9
-; CHECK-GI-NEXT: ldrb w9, [x0, #3]
-; CHECK-GI-NEXT: mov v3.s[2], w12
-; CHECK-GI-NEXT: ldrb w12, [x0, #7]
-; CHECK-GI-NEXT: mov v4.s[2], w13
-; CHECK-GI-NEXT: mov v5.s[2], w15
-; CHECK-GI-NEXT: mov v6.s[2], w16
-; CHECK-GI-NEXT: mov v7.s[2], w17
-; CHECK-GI-NEXT: mov v16.s[2], wzr
+; CHECK-GI-NEXT: umov w8, v2.b[3]
+; CHECK-GI-NEXT: umov w9, v2.b[7]
+; CHECK-GI-NEXT: umov w16, v2.b[11]
+; CHECK-GI-NEXT: umov w17, v2.b[15]
+; CHECK-GI-NEXT: umov w18, v1.b[3]
+; CHECK-GI-NEXT: umov w0, v1.b[7]
+; CHECK-GI-NEXT: mov v3.s[2], w10
+; CHECK-GI-NEXT: mov v4.s[2], w11
+; CHECK-GI-NEXT: mov v5.s[2], w12
+; CHECK-GI-NEXT: mov v6.s[2], w13
+; CHECK-GI-NEXT: mov v7.s[2], w14
+; CHECK-GI-NEXT: mov v16.s[2], w15
+; CHECK-GI-NEXT: mov v17.s[2], wzr
; CHECK-GI-NEXT: mov v0.s[2], wzr
-; CHECK-GI-NEXT: mov v2.s[3], w8
-; CHECK-GI-NEXT: mov v3.s[3], w10
-; CHECK-GI-NEXT: mov v4.s[3], w11
-; CHECK-GI-NEXT: mov v5.s[3], w14
-; CHECK-GI-NEXT: mov v6.s[3], w9
-; CHECK-GI-NEXT: mov v7.s[3], w12
-; CHECK-GI-NEXT: mov v16.s[3], wzr
+; CHECK-GI-NEXT: mov v3.s[3], w8
+; CHECK-GI-NEXT: mov v4.s[3], w9
+; CHECK-GI-NEXT: mov v5.s[3], w16
+; CHECK-GI-NEXT: mov v6.s[3], w17
+; CHECK-GI-NEXT: mov v7.s[3], w18
+; CHECK-GI-NEXT: mov v16.s[3], w0
+; CHECK-GI-NEXT: mov v17.s[3], wzr
; CHECK-GI-NEXT: mov v0.s[3], wzr
-; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT: add v0.4s, v16.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: add v2.4s, v5.4s, v6.4s
+; CHECK-GI-NEXT: add v3.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT: add v0.4s, v17.4s, v0.4s
; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
%0 = load <25 x i8>, ptr %a1
@@ -2554,128 +2580,145 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_sdot_v25i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr q2, [x1]
-; CHECK-GI-NEXT: ldrsb w9, [x1, #16]!
-; CHECK-GI-NEXT: ldrsb w11, [x1, #4]
-; CHECK-GI-NEXT: ldrsb w12, [x1, #5]
+; CHECK-GI-NEXT: stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w21, -24
+; CHECK-GI-NEXT: .cfi_offset w22, -32
+; CHECK-GI-NEXT: .cfi_offset w23, -40
+; CHECK-GI-NEXT: .cfi_offset w24, -48
+; CHECK-GI-NEXT: .cfi_offset w25, -56
+; CHECK-GI-NEXT: .cfi_offset w26, -64
+; CHECK-GI-NEXT: ldp q1, q7, [x1]
; CHECK-GI-NEXT: fmov s0, wzr
-; CHECK-GI-NEXT: smov w13, v2.b[4]
-; CHECK-GI-NEXT: smov w14, v2.b[5]
-; CHECK-GI-NEXT: smov w10, v2.b[0]
-; CHECK-GI-NEXT: fmov s3, w9
-; CHECK-GI-NEXT: smov w9, v2.b[8]
-; CHECK-GI-NEXT: fmov s5, w11
-; CHECK-GI-NEXT: smov w11, v2.b[12]
-; CHECK-GI-NEXT: ldr q1, [x0]
-; CHECK-GI-NEXT: ldrsb w8, [x1, #1]
+; CHECK-GI-NEXT: ldp q16, q3, [x0]
+; CHECK-GI-NEXT: smov w9, v1.b[4]
+; CHECK-GI-NEXT: smov w11, v1.b[5]
+; CHECK-GI-NEXT: smov w18, v1.b[0]
+; CHECK-GI-NEXT: smov w0, v1.b[12]
+; CHECK-GI-NEXT: smov w3, v7.b[4]
+; CHECK-GI-NEXT: smov w12, v1.b[1]
+; CHECK-GI-NEXT: smov w13, v1.b[6]
+; CHECK-GI-NEXT: smov w1, v1.b[13]
+; CHECK-GI-NEXT: smov w4, v7.b[5]
+; CHECK-GI-NEXT: smov w15, v1.b[2]
+; CHECK-GI-NEXT: smov w8, v1.b[3]
+; CHECK-GI-NEXT: smov w16, v1.b[7]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: smov w14, v1.b[8]
+; CHECK-GI-NEXT: smov w17, v1.b[9]
+; CHECK-GI-NEXT: smov w10, v1.b[10]
+; CHECK-GI-NEXT: smov w9, v1.b[11]
+; CHECK-GI-NEXT: smov w5, v1.b[14]
+; CHECK-GI-NEXT: smov w6, v7.b[0]
+; CHECK-GI-NEXT: fmov s4, w0
+; CHECK-GI-NEXT: fmov s5, w3
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: smov w11, v1.b[15]
+; CHECK-GI-NEXT: fmov s1, w18
+; CHECK-GI-NEXT: smov w7, v7.b[1]
+; CHECK-GI-NEXT: smov w18, v7.b[6]
+; CHECK-GI-NEXT: smov w21, v16.b[4]
+; CHECK-GI-NEXT: mov v4.s[1], w1
+; CHECK-GI-NEXT: mov v5.s[1], w4
+; CHECK-GI-NEXT: fmov s6, w14
+; CHECK-GI-NEXT: mov v1.s[1], w12
+; CHECK-GI-NEXT: smov w12, v7.b[3]
+; CHECK-GI-NEXT: smov w14, v7.b[7]
+; CHECK-GI-NEXT: mov v2.s[2], w13
+; CHECK-GI-NEXT: smov w13, v7.b[2]
+; CHECK-GI-NEXT: smov w0, v7.b[8]
+; CHECK-GI-NEXT: fmov s7, w6
+; CHECK-GI-NEXT: smov w23, v16.b[12]
+; CHECK-GI-NEXT: smov w25, v3.b[4]
+; CHECK-GI-NEXT: mov v6.s[1], w17
+; CHECK-GI-NEXT: mov v4.s[2], w5
+; CHECK-GI-NEXT: mov v5.s[2], w18
+; CHECK-GI-NEXT: mov v1.s[2], w15
+; CHECK-GI-NEXT: smov w6, v16.b[0]
+; CHECK-GI-NEXT: smov w3, v16.b[1]
+; CHECK-GI-NEXT: mov v2.s[3], w16
+; CHECK-GI-NEXT: mov v7.s[1], w7
+; CHECK-GI-NEXT: smov w16, v16.b[2]
+; CHECK-GI-NEXT: smov w15, v16.b[3]
+; CHECK-GI-NEXT: smov w22, v16.b[5]
+; CHECK-GI-NEXT: smov w5, v16.b[6]
+; CHECK-GI-NEXT: smov w18, v16.b[7]
+; CHECK-GI-NEXT: smov w19, v16.b[8]
+; CHECK-GI-NEXT: smov w7, v16.b[9]
+; CHECK-GI-NEXT: smov w24, v16.b[13]
+; CHECK-GI-NEXT: smov w1, v16.b[10]
+; CHECK-GI-NEXT: smov w17, v16.b[11]
+; CHECK-GI-NEXT: smov w20, v16.b[14]
+; CHECK-GI-NEXT: smov w4, v16.b[15]
+; CHECK-GI-NEXT: fmov s16, w21
+; CHECK-GI-NEXT: smov w21, v3.b[8]
+; CHECK-GI-NEXT: smov w26, v3.b[5]
+; CHECK-GI-NEXT: fmov s17, w23
+; CHECK-GI-NEXT: smov w23, v3.b[0]
+; CHECK-GI-NEXT: fmov s18, w25
+; CHECK-GI-NEXT: smov w25, v3.b[3]
+; CHECK-GI-NEXT: mov v16.s[1], w22
+; CHECK-GI-NEXT: smov w22, v3.b[1]
+; CHECK-GI-NEXT: fmov s19, w6
+; CHECK-GI-NEXT: mov v17.s[1], w24
+; CHECK-GI-NEXT: smov w24, v3.b[2]
+; CHECK-GI-NEXT: smov w6, v3.b[7]
+; CHECK-GI-NEXT: mul w0, w0, w21
+; CHECK-GI-NEXT: mov v18.s[1], w26
+; CHECK-GI-NEXT: smov w26, v3.b[6]
+; CHECK-GI-NEXT: fmov s3, w19
+; CHECK-GI-NEXT: fmov s20, w23
+; CHECK-GI-NEXT: mov v19.s[1], w3
+; CHECK-GI-NEXT: mov v16.s[2], w5
; CHECK-GI-NEXT: mov v0.s[1], wzr
-; CHECK-GI-NEXT: fmov s7, w13
-; CHECK-GI-NEXT: fmov s4, w10
-; CHECK-GI-NEXT: smov w10, v2.b[13]
-; CHECK-GI-NEXT: mov v5.s[1], w12
-; CHECK-GI-NEXT: smov w13, v2.b[9]
-; CHECK-GI-NEXT: fmov s6, w9
-; CHECK-GI-NEXT: fmov s16, w11
-; CHECK-GI-NEXT: smov w9, v1.b[0]
-; CHECK-GI-NEXT: mov v3.s[1], w8
-; CHECK-GI-NEXT: mov v7.s[1], w14
-; CHECK-GI-NEXT: smov w14, v2.b[6]
-; CHECK-GI-NEXT: ldrsb w12, [x1, #6]
-; CHECK-GI-NEXT: smov w8, v2.b[1]
-; CHECK-GI-NEXT: smov w11, v2.b[2]
-; CHECK-GI-NEXT: mov v0.s[2], wzr
-; CHECK-GI-NEXT: mov v16.s[1], w10
-; CHECK-GI-NEXT: smov w10, v2.b[14]
-; CHECK-GI-NEXT: mov v5.s[2], w12
-; CHECK-GI-NEXT: smov w12, v1.b[5]
-; CHECK-GI-NEXT: mov v6.s[1], w13
-; CHECK-GI-NEXT: fmov s17, w9
-; CHECK-GI-NEXT: mov v7.s[2], w14
-; CHECK-GI-NEXT: smov w14, v1.b[4]
-; CHECK-GI-NEXT: smov w9, v2.b[10]
-; CHECK-GI-NEXT: mov v4.s[1], w8
-; CHECK-GI-NEXT: smov w8, v1.b[1]
-; CHECK-GI-NEXT: smov w13, v2.b[7]
-; CHECK-GI-NEXT: mov v16.s[2], w10
-; CHECK-GI-NEXT: smov w10, v2.b[15]
-; CHECK-GI-NEXT: mov v0.s[3], wzr
-; CHECK-GI-NEXT: fmov s18, w14
-; CHECK-GI-NEXT: mov v6.s[2], w9
-; CHECK-GI-NEXT: smov w9, v1.b[12]
-; CHECK-GI-NEXT: mov v4.s[2], w11
-; CHECK-GI-NEXT: ldrsb w11, [x1, #7]
-; CHECK-GI-NEXT: mov v17.s[1], w8
-; CHECK-GI-NEXT: ldrsb w8, [x1, #2]
-; CHECK-GI-NEXT: mov v16.s[3], w10
-; CHECK-GI-NEXT: smov w10, v1.b[13]
-; CHECK-GI-NEXT: mov v18.s[1], w12
-; CHECK-GI-NEXT: smov w12, v1.b[6]
-; CHECK-GI-NEXT: mov v5.s[3], w11
-; CHECK-GI-NEXT: ldrsb w11, [x0, #16]!
-; CHECK-GI-NEXT: mov v7.s[3], w13
-; CHECK-GI-NEXT: smov w13, v1.b[2]
-; CHECK-GI-NEXT: fmov s20, w9
-; CHECK-GI-NEXT: ldrsb w9, [x0, #5]
-; CHECK-GI-NEXT: mov v3.s[2], w8
-; CHECK-GI-NEXT: smov w8, v1.b[8]
-; CHECK-GI-NEXT: fmov s22, w11
-; CHECK-GI-NEXT: mov v18.s[2], w12
-; CHECK-GI-NEXT: ldrsb w12, [x0, #4]
-; CHECK-GI-NEXT: smov w11, v2.b[3]
-; CHECK-GI-NEXT: mov v20.s[1], w10
-; CHECK-GI-NEXT: ldrsb w10, [x0, #8]
-; CHECK-GI-NEXT: fmov s21, w12
-; CHECK-GI-NEXT: ldrsb w12, [x1, #8]
-; CHECK-GI-NEXT: mov v17.s[2], w13
-; CHECK-GI-NEXT: smov w13, v1.b[9]
-; CHECK-GI-NEXT: fmov s19, w8
-; CHECK-GI-NEXT: smov w8, v1.b[14]
-; CHECK-GI-NEXT: mul w10, w12, w10
-; CHECK-GI-NEXT: smov w12, v1.b[7]
+; CHECK-GI-NEXT: mov v6.s[2], w10
+; CHECK-GI-NEXT: fmov s21, w0
+; CHECK-GI-NEXT: mov v17.s[2], w20
; CHECK-GI-NEXT: mov v4.s[3], w11
-; CHECK-GI-NEXT: mov v21.s[1], w9
-; CHECK-GI-NEXT: ldrsb w9, [x0, #6]
-; CHECK-GI-NEXT: mov v19.s[1], w13
-; CHECK-GI-NEXT: ldrsb w13, [x0, #1]
-; CHECK-GI-NEXT: mov v20.s[2], w8
-; CHECK-GI-NEXT: smov w8, v1.b[10]
-; CHECK-GI-NEXT: mov v18.s[3], w12
-; CHECK-GI-NEXT: ldrsb w12, [x0, #7]
-; CHECK-GI-NEXT: mov v21.s[2], w9
-; CHECK-GI-NEXT: smov w9, v2.b[11]
-; CHECK-GI-NEXT: fmov s2, w10
-; CHECK-GI-NEXT: ldrsb w10, [x0, #2]
-; CHECK-GI-NEXT: mov v22.s[1], w13
-; CHECK-GI-NEXT: smov w13, v1.b[15]
-; CHECK-GI-NEXT: mov v2.s[1], wzr
-; CHECK-GI-NEXT: mov v19.s[2], w8
-; CHECK-GI-NEXT: smov w8, v1.b[3]
-; CHECK-GI-NEXT: mov v21.s[3], w12
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v3.s[1], w7
+; CHECK-GI-NEXT: mov v20.s[1], w22
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v18.s[2], w26
+; CHECK-GI-NEXT: mov v21.s[1], wzr
+; CHECK-GI-NEXT: mov v16.s[3], w18
+; CHECK-GI-NEXT: mov v17.s[3], w4
+; CHECK-GI-NEXT: mov v7.s[2], w13
+; CHECK-GI-NEXT: mov v5.s[3], w14
+; CHECK-GI-NEXT: mov v19.s[2], w16
+; CHECK-GI-NEXT: mov v3.s[2], w1
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: mov v20.s[2], w24
+; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v18.s[3], w6
+; CHECK-GI-NEXT: mov v21.s[2], wzr
+; CHECK-GI-NEXT: mul v2.4s, v2.4s, v16.4s
+; CHECK-GI-NEXT: mul v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT: mov v1.s[3], w8
; CHECK-GI-NEXT: mov v6.s[3], w9
-; CHECK-GI-NEXT: ldrsb w9, [x0, #3]
-; CHECK-GI-NEXT: mov v20.s[3], w13
-; CHECK-GI-NEXT: smov w13, v1.b[11]
-; CHECK-GI-NEXT: mov v22.s[2], w10
-; CHECK-GI-NEXT: ldrsb w10, [x1, #3]
-; CHECK-GI-NEXT: mul v1.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT: mov v2.s[2], wzr
-; CHECK-GI-NEXT: mov v17.s[3], w8
-; CHECK-GI-NEXT: mov v3.s[3], w10
-; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s
-; CHECK-GI-NEXT: mov v19.s[3], w13
-; CHECK-GI-NEXT: mul v7.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT: mov v22.s[3], w9
-; CHECK-GI-NEXT: mov v2.s[3], wzr
-; CHECK-GI-NEXT: mla v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT: mla v7.4s, v6.4s, v19.4s
-; CHECK-GI-NEXT: mla v5.4s, v3.4s, v22.4s
-; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT: add v1.4s, v1.4s, v7.4s
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: mov v19.s[3], w15
+; CHECK-GI-NEXT: mov v3.s[3], w17
+; CHECK-GI-NEXT: mov v20.s[3], w25
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: mul v5.4s, v5.4s, v18.4s
+; CHECK-GI-NEXT: mov v21.s[3], wzr
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v19.4s
+; CHECK-GI-NEXT: mla v4.4s, v6.4s, v3.4s
+; CHECK-GI-NEXT: mla v5.4s, v7.4s, v20.4s
+; CHECK-GI-NEXT: add v0.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s
; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
%0 = load <25 x i8>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 7a436eddb23a6..5e278d59b6591 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldrb w8, [x0]
-; CHECK-GI-NEXT: ldrb w9, [x1]
-; CHECK-GI-NEXT: ldrb w10, [x0, #1]
-; CHECK-GI-NEXT: ldrb w11, [x1, #1]
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: ldrb w8, [x0, #2]
-; CHECK-GI-NEXT: ldrb w9, [x1, #2]
-; CHECK-GI-NEXT: mov v0.h[1], w10
-; CHECK-GI-NEXT: mov v1.h[1], w11
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v1.b[2]
+; CHECK-GI-NEXT: fmov w8, s2
+; CHECK-GI-NEXT: fmov w9, s3
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h
@@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: ldr h1, [x1]
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
-; CHECK-GI-NEXT: add x9, x1, #2
-; CHECK-GI-NEXT: add x10, x1, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
-; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
-; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
More information about the llvm-commits
mailing list