[llvm] 2aa629d - AArch64: Enable terminal rule (#165959)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 12:49:25 PST 2025
Author: Matt Arsenault
Date: 2025-11-10T12:49:21-08:00
New Revision: 2aa629da6a611be3f8aec9e1dd6d0a7f5f8f6a23
URL: https://github.com/llvm/llvm-project/commit/2aa629da6a611be3f8aec9e1dd6d0a7f5f8f6a23
DIFF: https://github.com/llvm/llvm-project/commit/2aa629da6a611be3f8aec9e1dd6d0a7f5f8f6a23.diff
LOG: AArch64: Enable terminal rule (#165959)
Added:
Modified:
llvm/lib/Target/AArch64/AArch64Subtarget.h
llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8974965c41fe3..ab4004e30f629 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -157,7 +157,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
bool enableSubRegLiveness() const override { return EnableSubregLiveness; }
-
+ bool enableTerminalRule() const override { return true; }
bool enableMachinePipeliner() const override;
bool useDFAforSMS() const override { return false; }
diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
index dbbfbea9176f6..f725c19081deb 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -188,11 +188,11 @@ entry:
define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: test11:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ld1r { v1.8b }, [x0]
-; CHECK-NEXT: ld1r { v2.8b }, [x1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.h[2], v2.h[0]
-; CHECK-NEXT: mov v0.h[3], v1.h[0]
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ld1r { v1.8b }, [x1]
+; CHECK-NEXT: fmov d2, d0
+; CHECK-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-NEXT: mov v0.h[3], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
index 3230c9e946da7..b3a7ec961b736 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
@@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) {
; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: adrp x8, A
-; CHECK-NEXT: mov w20, w19
-; CHECK-NEXT: ldr w21, [x8, :lo12:A]
+; CHECK-NEXT: mov w21, w19
+; CHECK-NEXT: ldr w20, [x8, :lo12:A]
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov w0, w21
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl _Z3usei
-; CHECK-NEXT: sdiv w20, w20, w0
-; CHECK-NEXT: subs w19, w19, #1
+; CHECK-NEXT: sdiv w19, w19, w0
+; CHECK-NEXT: subs w21, w21, #1
; CHECK-NEXT: b.ne .LBB0_2
-; CHECK-NEXT: b .LBB0_4
-; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: mov w20, w19
-; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup
-; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
@@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) {
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl _Z3usei
-; CHECK-NEXT: sdiv w21, w21, w0
-; CHECK-NEXT: subs w19, w19, #1
+; CHECK-NEXT: sdiv w19, w19, w0
+; CHECK-NEXT: subs w21, w21, #1
; CHECK-NEXT: b.ne .LBB1_2
-; CHECK-NEXT: b .LBB1_4
-; CHECK-NEXT: .LBB1_3:
-; CHECK-NEXT: mov w21, w19
-; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup
+; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov w0, w21
; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) {
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl _Z3usei
-; CHECK-NEXT: sdiv w21, w21, w0
-; CHECK-NEXT: subs w19, w19, #1
+; CHECK-NEXT: sdiv w19, w19, w0
+; CHECK-NEXT: subs w21, w21, #1
; CHECK-NEXT: b.ne .LBB2_2
-; CHECK-NEXT: b .LBB2_4
-; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT: mov w21, w19
-; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup
+; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov w0, w21
; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
index e7e109170d6a1..338084295fc7f 100644
--- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
+++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll
@@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) {
; CHECK-NEXT: mov w9, wzr
; CHECK-NEXT: LBB0_1: ; %.thread
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsr w11, w9, #1
; CHECK-NEXT: sub w10, w9, #1
-; CHECK-NEXT: mov w9, w11
+; CHECK-NEXT: lsr w9, w9, #1
; CHECK-NEXT: tbnz w10, #0, LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb343
; CHECK-NEXT: and w9, w10, #0x1
-; CHECK-NEXT: mov w0, #-1
+; CHECK-NEXT: mov w0, #-1 ; =0xffffffff
; CHECK-NEXT: str w9, [x8]
; CHECK-NEXT: ret
bb:
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index b947c943ba448..72f6646930624 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_f16
@@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: bl use_f32
@@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_f64
@@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
@@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
@@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
@@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
@@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
@@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
@@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
@@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
@@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
@@ -674,12 +662,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
@@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
@@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8bf16
@@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
@@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index f2163ad15bafc..df88f37195ed6 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mrs x19, SVCR
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: mrs x19, SVCR
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index 6c6a691760af3..52a77cb396909 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2(<vscale x 16 x float> %arg) {
define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
; CHECK-LABEL: extract_v4i1_nxv32i1_0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT: umov w8, v1.b[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: umov w9, v1.b[2]
+; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: umov w8, v1.b[2]
+; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: umov w8, v1.b[3]
-; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1(<vscale x 32 x i1> %arg, i64 0)
ret <4 x i1> %ext
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index e10313773c73e..72994100b2970 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
; CHECK-LABEL: extract_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[2]
+; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0)
ret <4 x i1> %mask
@@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
; CHECK-LABEL: extract_v8i1_nxv8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: umov w9, v1.h[2]
+; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v0.b[1], w8
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: mov v0.b[2], w8
; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v0.b[2], w9
-; CHECK-NEXT: umov w9, v1.h[4]
; CHECK-NEXT: mov v0.b[3], w8
+; CHECK-NEXT: umov w8, v1.h[4]
+; CHECK-NEXT: mov v0.b[4], w8
; CHECK-NEXT: umov w8, v1.h[5]
-; CHECK-NEXT: mov v0.b[4], w9
-; CHECK-NEXT: umov w9, v1.h[6]
; CHECK-NEXT: mov v0.b[5], w8
+; CHECK-NEXT: umov w8, v1.h[6]
+; CHECK-NEXT: mov v0.b[6], w8
; CHECK-NEXT: umov w8, v1.h[7]
-; CHECK-NEXT: mov v0.b[6], w9
; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0)
ret <8 x i1> %mask
@@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
; CHECK-LABEL: extract_v16i1_nxv16i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.b[1], v1.b[1]
+; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v0.b[1], v0.b[1]
; CHECK-NEXT: mov v0.b[2], v1.b[2]
; CHECK-NEXT: mov v0.b[3], v1.b[3]
; CHECK-NEXT: mov v0.b[4], v1.b[4]
@@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
; CHECK-NEXT: mov v0.b[13], v1.b[13]
; CHECK-NEXT: mov v0.b[14], v1.b[14]
; CHECK-NEXT: mov v0.b[15], v1.b[15]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1> %inmask, i64 0)
ret <16 x i1> %mask
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index 41e4a38fad90b..8e807cda7166d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu"
define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[2]
+; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: mov w8, v1.s[3]
-; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%el0 = extractelement <vscale x 4 x i1> %a, i32 0
%el1 = extractelement <vscale x 4 x i1> %a, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index ba4a3a2042305..bd8f432579a08 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
; CHECK: // %bb.0:
; CHECK-NEXT: tbnz w1, #0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %vector.body
+; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: umov w8, v0.b[8]
-; CHECK-NEXT: mov v1.b[1], v0.b[1]
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16
+; CHECK-NEXT: umov w8, v2.b[8]
+; CHECK-NEXT: mov v0.b[1], v2.b[1]
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16
; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: mov v1.b[2], v0.b[2]
-; CHECK-NEXT: mov v2.b[1], v0.b[9]
-; CHECK-NEXT: mov v1.b[3], v0.b[3]
-; CHECK-NEXT: mov v2.b[2], v0.b[10]
-; CHECK-NEXT: mov v1.b[4], v0.b[4]
-; CHECK-NEXT: mov v2.b[3], v0.b[11]
-; CHECK-NEXT: mov v1.b[5], v0.b[5]
-; CHECK-NEXT: mov v2.b[4], v0.b[12]
-; CHECK-NEXT: mov v1.b[6], v0.b[6]
-; CHECK-NEXT: mov v2.b[5], v0.b[13]
-; CHECK-NEXT: mov v1.b[7], v0.b[7]
-; CHECK-NEXT: mov v2.b[6], v0.b[14]
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: mov v2.b[7], v0.b[15]
-; CHECK-NEXT: uunpklo z0.h, z3.b
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.b[2], v2.b[2]
+; CHECK-NEXT: mov v1.b[1], v2.b[9]
+; CHECK-NEXT: mov v0.b[3], v2.b[3]
+; CHECK-NEXT: mov v1.b[2], v2.b[10]
+; CHECK-NEXT: mov v0.b[4], v2.b[4]
+; CHECK-NEXT: mov v1.b[3], v2.b[11]
+; CHECK-NEXT: mov v0.b[5], v2.b[5]
+; CHECK-NEXT: mov v1.b[4], v2.b[12]
+; CHECK-NEXT: mov v0.b[6], v2.b[6]
+; CHECK-NEXT: mov v1.b[5], v2.b[13]
+; CHECK-NEXT: mov v0.b[7], v2.b[7]
+; CHECK-NEXT: mov v1.b[6], v2.b[14]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: mov v1.b[7], v2.b[15]
+; CHECK-NEXT: uunpklo z2.h, z3.b
; CHECK-NEXT: uunpklo z3.h, z4.b
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: lsl z0.s, z0.s, #31
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: lsl z2.s, z2.s, #31
; CHECK-NEXT: lsl z3.s, z3.s, #31
-; CHECK-NEXT: asr z1.s, z1.s, #31
; CHECK-NEXT: asr z0.s, z0.s, #31
+; CHECK-NEXT: asr z2.s, z2.s, #31
; CHECK-NEXT: asr z3.s, z3.s, #31
-; CHECK-NEXT: lsl z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0
-; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: lsl z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0
+; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT: asr z2.s, z2.s, #31
-; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0
-; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p3, [x0]
-; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: asr z1.s, z1.s, #31
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl]
+; CHECK-NEXT: st1w { z2.s }, p3, [x0]
+; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT: .LBB1_2: // %exit
; CHECK-NEXT: ret
%broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index 124f81e7864d1..39fe92aae0619 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
; CHECK-NEXT: whilelt p0.s, wzr, w0
; CHECK-NEXT: b.pl .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: cntw x9
+; CHECK-NEXT: mov w9, wzr
+; CHECK-NEXT: cntw x8
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w8, w0
-; CHECK-NEXT: add w8, w8, w9
+; CHECK-NEXT: whilelt p0.s, w9, w0
+; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: b.mi .LBB0_2
; CHECK-NEXT: .LBB0_3: // %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 74a717f1635a3..935189dec48ac 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB24_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: mov x8, x0
; CHECK-BE-NEXT: ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT: ld1 { v3.8h }, [x8]
-; CHECK-BE-NEXT: add x9, x0, #48
-; CHECK-BE-NEXT: add x10, x0, #32
+; CHECK-BE-NEXT: add x0, x0, #16
+; CHECK-BE-NEXT: add x9, x8, #48
+; CHECK-BE-NEXT: ld1 { v3.8h }, [x0]
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
@@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h
; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h
; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
-; CHECK-BE-NEXT: mov x0, x8
+; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x8, #32
; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v0.4s }, [x10]
-; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x0]
; CHECK-BE-NEXT: b.ne .LBB24_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB25_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v4.16b }, [x0]
-; CHECK-BE-NEXT: add x9, x1, #48
-; CHECK-BE-NEXT: add x8, x1, #32
-; CHECK-BE-NEXT: ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT: add x10, x1, #48
; CHECK-BE-NEXT: ld1 { v16.4s }, [x1]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: ld1 { v18.4s }, [x10]
; CHECK-BE-NEXT: add x1, x1, #16
-; CHECK-BE-NEXT: ld1 { v20.4s }, [x8]
+; CHECK-BE-NEXT: ld1 { v20.4s }, [x9]
; CHECK-BE-NEXT: ld1 { v22.4s }, [x1]
-; CHECK-BE-NEXT: add x8, x0, #96
+; CHECK-BE-NEXT: add x9, x0, #96
; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b
; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b
; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8
-; CHECK-BE-NEXT: add x9, x0, #32
+; CHECK-BE-NEXT: mov x8, x0
; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8
-; CHECK-BE-NEXT: add x10, x0, #16
+; CHECK-BE-NEXT: add x10, x0, #32
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8
; CHECK-BE-NEXT: rev32 v5.8b, v5.8b
+; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8
; CHECK-BE-NEXT: rev32 v21.8b, v7.8b
; CHECK-BE-NEXT: rev32 v23.8b, v4.8b
; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8
@@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: rev32 v4.8b, v4.8b
; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s
; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s
-; CHECK-BE-NEXT: st1 { v5.2d }, [x8]
+; CHECK-BE-NEXT: st1 { v5.2d }, [x9]
; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s
; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s
-; CHECK-BE-NEXT: add x8, x0, #112
+; CHECK-BE-NEXT: add x9, x0, #112
; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s
-; CHECK-BE-NEXT: st1 { v18.2d }, [x9]
-; CHECK-BE-NEXT: add x9, x0, #80
+; CHECK-BE-NEXT: st1 { v18.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x0, #80
; CHECK-BE-NEXT: st1 { v22.2d }, [x0]
-; CHECK-BE-NEXT: st1 { v17.2d }, [x8]
-; CHECK-BE-NEXT: add x8, x0, #64
-; CHECK-BE-NEXT: st1 { v19.2d }, [x9]
-; CHECK-BE-NEXT: add x9, x0, #48
-; CHECK-BE-NEXT: mov x0, x8
-; CHECK-BE-NEXT: st1 { v5.2d }, [x8]
+; CHECK-BE-NEXT: add x0, x0, #64
+; CHECK-BE-NEXT: st1 { v17.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x8, #48
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: st1 { v19.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v5.2d }, [x0]
; CHECK-BE-NEXT: st1 { v6.2d }, [x9]
-; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v4.2d }, [x8]
; CHECK-BE-NEXT: b.ne .LBB25_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB26_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16
-; CHECK-BE-NEXT: add x8, x0, #32
+; CHECK-BE-NEXT: mov x8, x0
+; CHECK-BE-NEXT: add x9, x0, #32
; CHECK-BE-NEXT: ld1 { v16.4s }, [x0]
-; CHECK-BE-NEXT: add x9, x0, #48
-; CHECK-BE-NEXT: add x10, x0, #16
-; CHECK-BE-NEXT: ld1 { v17.4s }, [x8]
-; CHECK-BE-NEXT: ld1 { v18.4s }, [x9]
-; CHECK-BE-NEXT: ld1 { v19.4s }, [x10]
+; CHECK-BE-NEXT: add x10, x0, #48
+; CHECK-BE-NEXT: add x0, x0, #16
+; CHECK-BE-NEXT: ld1 { v17.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v18.4s }, [x10]
+; CHECK-BE-NEXT: ld1 { v19.4s }, [x0]
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b
@@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s
; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s
; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s
-; CHECK-BE-NEXT: st1 { v5.4s }, [x0]
-; CHECK-BE-NEXT: mov x0, x10
-; CHECK-BE-NEXT: st1 { v6.4s }, [x8]
-; CHECK-BE-NEXT: st1 { v7.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v4.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v5.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v6.4s }, [x9]
+; CHECK-BE-NEXT: st1 { v7.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
; CHECK-BE-NEXT: b.ne .LBB26_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
@@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: .LBB28_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16
-; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: mov x8, x0
; CHECK-BE-NEXT: ld1 { v1.8h }, [x0]
-; CHECK-BE-NEXT: ld1 { v3.8h }, [x8]
-; CHECK-BE-NEXT: add x9, x0, #48
-; CHECK-BE-NEXT: add x10, x0, #32
+; CHECK-BE-NEXT: add x0, x0, #16
+; CHECK-BE-NEXT: add x9, x8, #48
+; CHECK-BE-NEXT: ld1 { v3.8h }, [x0]
; CHECK-BE-NEXT: subs w2, w2, #1
; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
@@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) {
; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h
; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h
; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT: st1 { v4.4s }, [x0]
-; CHECK-BE-NEXT: mov x0, x8
+; CHECK-BE-NEXT: st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x8, #32
; CHECK-BE-NEXT: st1 { v5.4s }, [x9]
-; CHECK-BE-NEXT: st1 { v0.4s }, [x10]
-; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x0]
; CHECK-BE-NEXT: b.ne .LBB28_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: mov w0, wzr
More information about the llvm-commits
mailing list