[llvm-branch-commits] [llvm] [AArch64][ISel] Select constructive EXT_ZZZI pseudo instruction (PR #152554)

Gaƫtan Bossu via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Aug 7 10:19:17 PDT 2025


https://github.com/gbossu created https://github.com/llvm/llvm-project/pull/152554

The patch changes existing patterns to select the EXT_ZZZI pseudo
instead of the EXT_ZZI destructive instruction for vector_splice.
    
Given that registers aren't tied anymore, this gives the register
allocator more freedom and a lot of MOVs get replaced with MOVPRFX.
    
In some cases however, we could have just chosen the same input and
output register, but regalloc preferred not to. This means we end up
with some test cases now having more instructions: there is now a
MOVPRFX while no MOV was previously needed.

>From f07ab3218a15f36ab04fc70f4a78134ff477c3eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at arm.com>
Date: Wed, 6 Aug 2025 16:12:02 +0000
Subject: [PATCH 1/2] [AArch64][ISel] Select constructive EXT_ZZZI pseudo
 instruction

The patch changes existing patterns to select the EXT_ZZZI pseudo
instead of the EXT_ZZI destructive instruction for vector_splice.

Given that registers aren't tied anymore, this gives the register
allocator more freedom and a lot of MOVs get replaced with MOVPRFX.

In some cases however, we could have just chosen the same input and
output register, but regalloc preferred not to. This means we end up
with some test cases now having more instructions: there is now a
MOVPRFX while no MOV was previously needed.
---
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |   8 +-
 .../AArch64/named-vector-shuffles-sve.ll      |  36 +-
 .../sve-fixed-length-extract-subvector.ll     |  41 +-
 .../AArch64/sve-fixed-length-fp-to-int.ll     |  44 +-
 .../AArch64/sve-fixed-length-int-extends.ll   |  54 +-
 .../AArch64/sve-fixed-length-int-rem.ll       |  40 +-
 .../AArch64/sve-fixed-length-int-to-fp.ll     |  44 +-
 .../AArch64/sve-fixed-length-limit-duplane.ll |  14 +-
 .../AArch64/sve-fixed-length-masked-loads.ll  | 126 +--
 .../sve-fixed-length-partial-reduce.ll        |  28 +-
 .../AArch64/sve-fixed-length-shuffles.ll      |  41 +-
 llvm/test/CodeGen/AArch64/sve-pr92779.ll      |  18 +-
 ...aming-mode-fixed-length-fp-extend-trunc.ll |  27 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 286 +++----
 ...streaming-mode-fixed-length-int-extends.ll | 740 ++++++++++--------
 ...sve-streaming-mode-fixed-length-int-rem.ll | 216 ++---
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 284 +++----
 ...reaming-mode-fixed-length-limit-duplane.ll |  15 +-
 ...streaming-mode-fixed-length-masked-load.ll |  26 +-
 ...treaming-mode-fixed-length-masked-store.ll |  38 +-
 ...-streaming-mode-fixed-length-reductions.ll |  94 ++-
 21 files changed, 1197 insertions(+), 1023 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 85e647af6684c..a3ca0cb73cd43 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2135,19 +2135,19 @@ let Predicates = [HasSVE_or_SME] in {
   // Splice with lane bigger or equal to 0
   foreach VT = [nxv16i8] in
     def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_255 i32:$index)))),
-              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+              (EXT_ZZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
 
   foreach VT = [nxv8i16, nxv8f16, nxv8bf16] in
     def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_127 i32:$index)))),
-              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+              (EXT_ZZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
 
   foreach VT = [nxv4i32, nxv4f16, nxv4f32, nxv4bf16] in
     def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_63 i32:$index)))),
-              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+              (EXT_ZZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
 
   foreach VT = [nxv2i64, nxv2f16, nxv2f32, nxv2f64, nxv2bf16] in
     def : Pat<(VT (vector_splice VT:$Z1, VT:$Z2, (i64 (sve_ext_imm_0_31 i32:$index)))),
-              (EXT_ZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+              (EXT_ZZZI  ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
 
   defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
   defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 69e805d9ca2ee..96311c2849f7c 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -256,12 +256,13 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
 define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
-; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    and z0.d, z0.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
   ret <vscale x 2 x i1> %res
@@ -271,12 +272,13 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
 define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    and z1.s, z1.s, #0x1
-; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
   ret <vscale x 4 x i1> %res
@@ -286,12 +288,13 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
 define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    and z1.h, z1.h, #0x1
-; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    and z0.h, z0.h, #0x1
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
   ret <vscale x 8 x i1> %res
@@ -301,12 +304,13 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
 define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
-; CHECK-NEXT:    and z1.b, z1.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    and z0.b, z0.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
   ret <vscale x 16 x i1> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 800f95d97af4c..7b438743487e1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -50,7 +50,7 @@ define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
 ; CHECK-LABEL: extract_v32i8_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -68,7 +68,7 @@ define void @extract_v32i8_half_unaligned(ptr %in, ptr %out) #0 vscale_range(2,2
 ; CHECK-LABEL: extract_v32i8_half_unaligned:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
 ; CHECK-NEXT:    str q0, [x1]
@@ -84,15 +84,16 @@ define void @extract_v32i8_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr
 ; CHECK-LABEL: extract_v32i8_quarters:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #24
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    str d1, [x1]
 ; CHECK-NEXT:    str d2, [x2]
 ; CHECK-NEXT:    str d0, [x3]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    str d0, [x4]
+; CHECK-NEXT:    str d3, [x4]
 ; CHECK-NEXT:    ret
 entry:
   %b = load <32 x i8>, ptr %in
@@ -126,7 +127,7 @@ define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl32
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1b { z1.b }, p0, [x1]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
@@ -207,7 +208,7 @@ define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
 ; CHECK-LABEL: extract_v16i16_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -240,7 +241,7 @@ define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x1]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -322,7 +323,7 @@ define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
 ; CHECK-LABEL: extract_v8i32_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -355,7 +356,7 @@ define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x1]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
@@ -426,7 +427,7 @@ define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
 ; CHECK-LABEL: extract_v4i64_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -459,7 +460,7 @@ define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
@@ -553,7 +554,7 @@ define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
 ; CHECK-LABEL: extract_v16half_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -586,7 +587,7 @@ define void @extract_v32half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x1]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
@@ -668,7 +669,7 @@ define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_rang
 ; CHECK-LABEL: extract_v8float_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -701,7 +702,7 @@ define void @extract_v16float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x1]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
@@ -772,7 +773,7 @@ define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
 ; CHECK-LABEL: extract_v4double_halves:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    str q1, [x1]
 ; CHECK-NEXT:    str q0, [x2]
@@ -805,7 +806,7 @@ define void @extract_v8double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_ran
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr z0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
@@ -908,7 +909,7 @@ define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
 ; CHECK-NEXT:    add x8, x8, :lo12:.LCPI59_0
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ldr z0, [x8]
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT:    cmeq v1.4s, v1.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index af54b146c5b66..c8f6d98f5a63f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32:
@@ -551,13 +552,14 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64:
@@ -1043,13 +1045,14 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32:
@@ -1444,13 +1447,14 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
index 4feb86305f8f6..d2fa65599b973 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
@@ -77,11 +77,12 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
-; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
@@ -326,11 +327,12 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
@@ -490,11 +492,12 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
@@ -573,11 +576,12 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
-; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
@@ -822,11 +826,12 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
@@ -986,11 +991,12 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 2d78945399176..27be84419d59e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -259,17 +259,17 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    movprfx z5, z0
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
-; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
@@ -420,11 +420,11 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    mov z4.d, z0.d
+; VBITS_GE_256-NEXT:    movprfx z4, z0
 ; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    movprfx z3, z1
 ; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
@@ -507,11 +507,11 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    movprfx z4, z0
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
-; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
@@ -1077,17 +1077,17 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    movprfx z5, z0
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
-; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
@@ -1238,11 +1238,11 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    mov z4.d, z0.d
+; VBITS_GE_256-NEXT:    movprfx z4, z0
 ; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
-; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    movprfx z3, z1
 ; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
@@ -1325,11 +1325,11 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    movprfx z4, z0
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
-; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 1d9e01f4ecfdf..5753e5972f9c8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -150,13 +150,14 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32:
@@ -554,13 +555,14 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64:
@@ -1063,13 +1065,14 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: scvtf_v16i16_v16f32:
@@ -1479,13 +1482,14 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: scvtf_v8i32_v8f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
index 78b41f71f0ea2..8b584c448bbae 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
@@ -8,10 +8,11 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
-; CHECK-NEXT:    add z1.s, z0.s, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #16
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
 ; CHECK-NEXT:    add z2.s, z2.s, z2.s
 ; CHECK-NEXT:    dup v0.4s, v0.s[2]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
@@ -31,10 +32,11 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
-; CHECK-NEXT:    add z1.s, z0.s, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #24
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #24
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
 ; CHECK-NEXT:    add z2.s, z2.s, z2.s
 ; CHECK-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 34dc0bb5ef2d2..864a9f7987bdb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -375,11 +375,12 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
@@ -476,11 +477,12 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
@@ -542,11 +544,12 @@ define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
@@ -574,11 +577,12 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
@@ -675,11 +679,12 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
@@ -741,11 +746,12 @@ define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
@@ -782,11 +788,12 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
 ; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
@@ -915,11 +922,12 @@ define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
 ; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
@@ -1000,11 +1008,12 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
@@ -1041,11 +1050,12 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
 ; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
+; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
@@ -1174,11 +1184,12 @@ define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
 ; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
@@ -1259,11 +1270,12 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
@@ -1495,11 +1507,12 @@ define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
@@ -1527,11 +1540,12 @@ define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
 ; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
+; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index 33d5ac4cd299e..cc19f6c2cbbc8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -100,7 +100,7 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SVE-NEXT:    uunpkhi z1.h, z1.b
 ; SVE-NEXT:    mad z2.h, p0/m, z3.h, z4.h
 ; SVE-NEXT:    mad z0.h, p0/m, z1.h, z2.h
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -113,7 +113,7 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    umlalb z0.h, z2.b, z1.b
 ; SME-NEXT:    umlalt z0.h, z2.b, z1.b
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -223,7 +223,7 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SVE-NEXT:    uunpkhi z1.s, z1.h
 ; SVE-NEXT:    mad z2.s, p0/m, z3.s, z4.s
 ; SVE-NEXT:    mad z0.s, p0/m, z1.s, z2.s
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -236,7 +236,7 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    umlalb z0.s, z2.h, z1.h
 ; SME-NEXT:    umlalt z0.s, z2.h, z1.h
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -346,7 +346,7 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SVE-NEXT:    uunpkhi z1.d, z1.s
 ; SVE-NEXT:    mad z2.d, p0/m, z3.d, z4.d
 ; SVE-NEXT:    mad z0.d, p0/m, z1.d, z2.d
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -359,7 +359,7 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    umlalb z0.d, z2.s, z1.s
 ; SME-NEXT:    umlalt z0.d, z2.s, z1.s
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -635,7 +635,7 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SVE-NEXT:    ldr z1, [x1]
 ; SVE-NEXT:    ldr z2, [x2]
 ; SVE-NEXT:    udot z0.s, z2.b, z1.b
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -647,7 +647,7 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SME-NEXT:    ldr z1, [x1]
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    udot z0.s, z2.b, z1.b
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -680,7 +680,7 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr)
 ; SVE-NEXT:    ldr z1, [x1]
 ; SVE-NEXT:    ldr z2, [x2]
 ; SVE-NEXT:    usdot z0.s, z1.b, z2.b
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -692,7 +692,7 @@ define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr)
 ; SME-NEXT:    ldr z1, [x1]
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    usdot z0.s, z1.b, z2.b
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -813,7 +813,7 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca
 ; SVE-NEXT:    ldr z1, [x1]
 ; SVE-NEXT:    ldr z2, [x2]
 ; SVE-NEXT:    udot z0.d, z2.h, z1.h
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -825,7 +825,7 @@ define <4 x i64> @four_way_i16_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vsca
 ; SME-NEXT:    ldr z1, [x1]
 ; SME-NEXT:    ldr z2, [x2]
 ; SME-NEXT:    udot z0.d, z2.h, z1.h
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -984,7 +984,7 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SVE-NEXT:    uunpkhi z0.d, z0.s
 ; SVE-NEXT:    add z1.d, z2.d, z1.d
 ; SVE-NEXT:    add z0.d, z1.d, z0.d
-; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    movprfx z1, z0
 ; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
@@ -999,7 +999,7 @@ define <4 x i64> @four_way_i8_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
 ; SME-NEXT:    ldr z0, [x0]
 ; SME-NEXT:    uaddwb z0.d, z0.d, z2.s
 ; SME-NEXT:    uaddwt z0.d, z0.d, z2.s
-; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    movprfx z1, z0
 ; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index 2eff6da0866f8..ba4a3a2042305 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -12,8 +12,8 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.s, s0
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #16
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    st2 { v0.4s, v1.4s }, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <8 x i32> zeroinitializer
@@ -33,6 +33,9 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    umov w8, v0.b[8]
 ; CHECK-NEXT:    mov v1.b[1], v0.b[1]
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #16
+; CHECK-NEXT:    ext v4.16b, v3.16b, v3.16b, #8
 ; CHECK-NEXT:    fmov s2, w8
 ; CHECK-NEXT:    mov v1.b[2], v0.b[2]
 ; CHECK-NEXT:    mov v2.b[1], v0.b[9]
@@ -48,32 +51,30 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK-NEXT:    mov v2.b[6], v0.b[14]
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    mov v2.b[7], v0.b[15]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT:    uunpklo z0.h, z3.b
+; CHECK-NEXT:    uunpklo z3.h, z4.b
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    lsl z1.s, z1.s, #31
-; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
-; CHECK-NEXT:    lsl z2.s, z2.s, #31
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    lsl z3.s, z3.s, #31
-; CHECK-NEXT:    asr z2.s, z2.s, #31
-; CHECK-NEXT:    st1w { z1.s }, p1, [x0]
-; CHECK-NEXT:    cmpne p2.s, p0/z, z0.s, #0
+; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
 ; CHECK-NEXT:    asr z3.s, z3.s, #31
-; CHECK-NEXT:    cmpne p3.s, p0/z, z3.s, #0
+; CHECK-NEXT:    lsl z2.s, z2.s, #31
+; CHECK-NEXT:    cmpne p3.s, p0/z, z1.s, #0
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
+; CHECK-NEXT:    asr z2.s, z2.s, #31
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z2.s, #0
-; CHECK-NEXT:    st1w { z1.s }, p2, [x0, #2, mul vl]
-; CHECK-NEXT:    st1w { z1.s }, p3, [x0, #3, mul vl]
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x0, #2, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p2, [x0, #3, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p3, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
index 3f34d79b3bb49..bce2d4bede385 100644
--- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
@@ -5,16 +5,16 @@ define void @main(ptr %0) {
 ; CHECK-LABEL: main:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    ptrue p0.d, vl1
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uzp1 v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    neg v0.2s, v0.2s
-; CHECK-NEXT:    smov x8, v0.s[0]
-; CHECK-NEXT:    smov x9, v0.s[1]
-; CHECK-NEXT:    mov z1.d, p0/m, x8
-; CHECK-NEXT:    mov z1.d, p0/m, x9
-; CHECK-NEXT:    str z1, [x0]
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    uzp1 v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    neg v1.2s, v1.2s
+; CHECK-NEXT:    smov x8, v1.s[0]
+; CHECK-NEXT:    smov x9, v1.s[1]
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    str z0, [x0]
 ; CHECK-NEXT:    ret
 "entry":
   %1 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 56149e99b15f8..6951a6cd50ef6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -75,13 +75,14 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f16_to_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32:
@@ -124,19 +125,21 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    fcvt z2.s, p0/m, z2.h
-; CHECK-NEXT:    fcvt z3.s, p0/m, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q3, q0, [x0]
-; CHECK-NEXT:    stp q2, q1, [x0, #32]
+; CHECK-NEXT:    fcvt z2.s, p0/m, z2.h
+; CHECK-NEXT:    fcvt z3.s, p0/m, z3.h
+; CHECK-NEXT:    stp q0, q3, [x0]
+; CHECK-NEXT:    stp q1, q2, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 94d756a36ab92..66e157c779abf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -262,12 +262,13 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32:
@@ -320,18 +321,20 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    fcvtzu z2.s, p0/m, z2.h
-; CHECK-NEXT:    fcvtzu z3.s, p0/m, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z2.h
+; CHECK-NEXT:    fcvtzu z3.s, p0/m, z3.h
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32:
@@ -521,27 +524,27 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z4.d, p0/m, z4.h
 ; CHECK-NEXT:    mov z5.h, z1.h[3]
 ; CHECK-NEXT:    mov z6.h, z1.h[2]
 ; CHECK-NEXT:    mov z7.h, z1.h[1]
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
-; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
 ; CHECK-NEXT:    fcvtzu z7.d, p0/m, z7.h
-; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    zip1 z3.d, z6.d, z5.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
@@ -597,52 +600,53 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z6.h, z0.h[2]
-; CHECK-NEXT:    mov z16.d, z0.d
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z1.h
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z0.h[2]
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z1.h
+; CHECK-NEXT:    mov z16.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
 ; CHECK-NEXT:    mov z17.h, z0.h[1]
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z6, z0
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
-; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z4.d, p0/m, z4.h
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzu z16.d, p0/m, z16.h
+; CHECK-NEXT:    mov z18.h, z5.h[3]
 ; CHECK-NEXT:    fcvtzu z17.d, p0/m, z17.h
-; CHECK-NEXT:    fcvtzu z7.d, p0/m, z7.h
-; CHECK-NEXT:    mov z20.h, z1.h[3]
-; CHECK-NEXT:    mov z18.h, z16.h[3]
-; CHECK-NEXT:    mov z19.h, z16.h[2]
-; CHECK-NEXT:    mov z21.h, z16.h[1]
-; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT:    mov z3.h, z1.h[2]
-; CHECK-NEXT:    zip1 z5.d, z6.d, z5.d
-; CHECK-NEXT:    mov z6.h, z1.h[1]
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; CHECK-NEXT:    mov z19.h, z6.h[3]
+; CHECK-NEXT:    mov z20.h, z6.h[2]
+; CHECK-NEXT:    mov z21.h, z6.h[1]
+; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT:    mov z7.h, z5.h[2]
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    mov z4.h, z5.h[1]
+; CHECK-NEXT:    fcvtzu z19.d, p0/m, z19.h
+; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzu z20.d, p0/m, z20.h
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z17.d
-; CHECK-NEXT:    fcvtzu z16.d, p0/m, z16.h
-; CHECK-NEXT:    fcvtzu z18.d, p0/m, z18.h
 ; CHECK-NEXT:    movprfx z17, z21
 ; CHECK-NEXT:    fcvtzu z17.d, p0/m, z21.h
-; CHECK-NEXT:    fcvtzu z19.d, p0/m, z19.h
-; CHECK-NEXT:    zip1 z4.d, z7.d, z4.d
-; CHECK-NEXT:    movprfx z7, z20
-; CHECK-NEXT:    fcvtzu z7.d, p0/m, z20.h
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.h
-; CHECK-NEXT:    stp q0, q5, [x1, #64]
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
-; CHECK-NEXT:    zip1 z0.d, z19.d, z18.d
-; CHECK-NEXT:    zip1 z5.d, z16.d, z17.d
-; CHECK-NEXT:    stp q2, q4, [x1]
-; CHECK-NEXT:    zip1 z2.d, z3.d, z7.d
-; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
-; CHECK-NEXT:    stp q5, q0, [x1, #96]
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    zip1 z1.d, z1.d, z16.d
+; CHECK-NEXT:    movprfx z16, z18
+; CHECK-NEXT:    fcvtzu z16.d, p0/m, z18.h
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z7.h
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z4.h
+; CHECK-NEXT:    stp q0, q3, [x1, #64]
+; CHECK-NEXT:    zip1 z0.d, z20.d, z19.d
+; CHECK-NEXT:    zip1 z3.d, z6.d, z17.d
+; CHECK-NEXT:    stp q2, q1, [x1]
+; CHECK-NEXT:    zip1 z1.d, z7.d, z16.d
+; CHECK-NEXT:    zip1 z2.d, z5.d, z4.d
+; CHECK-NEXT:    stp q3, q0, [x1, #96]
+; CHECK-NEXT:    stp q2, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64:
@@ -1068,12 +1072,13 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64:
@@ -1106,18 +1111,20 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.s
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.s
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64:
@@ -1957,12 +1964,13 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32:
@@ -2015,18 +2023,20 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    fcvtzs z2.s, p0/m, z2.h
-; CHECK-NEXT:    fcvtzs z3.s, p0/m, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z2.h
+; CHECK-NEXT:    fcvtzs z3.s, p0/m, z3.h
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32:
@@ -2217,27 +2227,27 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    mov z4.h, z0.h[1]
+; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.h
 ; CHECK-NEXT:    mov z5.h, z1.h[3]
 ; CHECK-NEXT:    mov z6.h, z1.h[2]
 ; CHECK-NEXT:    mov z7.h, z1.h[1]
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
-; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
 ; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
 ; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.h
-; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    zip1 z2.d, z3.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z0.d, z4.d
 ; CHECK-NEXT:    zip1 z3.d, z6.d, z5.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z7.d
+; CHECK-NEXT:    stp q0, q2, [x1]
 ; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
@@ -2293,52 +2303,53 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z3.h, z1.h[1]
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    mov z6.h, z0.h[2]
-; CHECK-NEXT:    mov z16.d, z0.d
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.h
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    mov z7.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    mov z4.h, z0.h[2]
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ext z5.b, z5.b, z1.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.h
+; CHECK-NEXT:    mov z16.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
 ; CHECK-NEXT:    mov z17.h, z0.h[1]
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z6, z0
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
-; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.h
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z16.h
+; CHECK-NEXT:    mov z18.h, z5.h[3]
 ; CHECK-NEXT:    fcvtzs z17.d, p0/m, z17.h
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.h
-; CHECK-NEXT:    mov z20.h, z1.h[3]
-; CHECK-NEXT:    mov z18.h, z16.h[3]
-; CHECK-NEXT:    mov z19.h, z16.h[2]
-; CHECK-NEXT:    mov z21.h, z16.h[1]
-; CHECK-NEXT:    zip1 z2.d, z2.d, z3.d
-; CHECK-NEXT:    mov z3.h, z1.h[2]
-; CHECK-NEXT:    zip1 z5.d, z6.d, z5.d
-; CHECK-NEXT:    mov z6.h, z1.h[1]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; CHECK-NEXT:    mov z19.h, z6.h[3]
+; CHECK-NEXT:    mov z20.h, z6.h[2]
+; CHECK-NEXT:    mov z21.h, z6.h[1]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
+; CHECK-NEXT:    zip1 z2.d, z7.d, z2.d
+; CHECK-NEXT:    mov z7.h, z5.h[2]
+; CHECK-NEXT:    zip1 z3.d, z4.d, z3.d
+; CHECK-NEXT:    mov z4.h, z5.h[1]
+; CHECK-NEXT:    fcvtzs z19.d, p0/m, z19.h
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
+; CHECK-NEXT:    fcvtzs z20.d, p0/m, z20.h
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z17.d
-; CHECK-NEXT:    fcvtzs z16.d, p0/m, z16.h
-; CHECK-NEXT:    fcvtzs z18.d, p0/m, z18.h
 ; CHECK-NEXT:    movprfx z17, z21
 ; CHECK-NEXT:    fcvtzs z17.d, p0/m, z21.h
-; CHECK-NEXT:    fcvtzs z19.d, p0/m, z19.h
-; CHECK-NEXT:    zip1 z4.d, z7.d, z4.d
-; CHECK-NEXT:    movprfx z7, z20
-; CHECK-NEXT:    fcvtzs z7.d, p0/m, z20.h
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.h
-; CHECK-NEXT:    stp q0, q5, [x1, #64]
-; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
-; CHECK-NEXT:    zip1 z0.d, z19.d, z18.d
-; CHECK-NEXT:    zip1 z5.d, z16.d, z17.d
-; CHECK-NEXT:    stp q2, q4, [x1]
-; CHECK-NEXT:    zip1 z2.d, z3.d, z7.d
-; CHECK-NEXT:    zip1 z1.d, z1.d, z6.d
-; CHECK-NEXT:    stp q5, q0, [x1, #96]
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    zip1 z1.d, z1.d, z16.d
+; CHECK-NEXT:    movprfx z16, z18
+; CHECK-NEXT:    fcvtzs z16.d, p0/m, z18.h
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z7.h
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.h
+; CHECK-NEXT:    stp q0, q3, [x1, #64]
+; CHECK-NEXT:    zip1 z0.d, z20.d, z19.d
+; CHECK-NEXT:    zip1 z3.d, z6.d, z17.d
+; CHECK-NEXT:    stp q2, q1, [x1]
+; CHECK-NEXT:    zip1 z1.d, z7.d, z16.d
+; CHECK-NEXT:    zip1 z2.d, z5.d, z4.d
+; CHECK-NEXT:    stp q3, q0, [x1, #96]
+; CHECK-NEXT:    stp q2, q1, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64:
@@ -2764,12 +2775,13 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64:
@@ -2802,18 +2814,20 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.s
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.s
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index 75911e5ff1569..c3dcb0f6d7f1f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -18,14 +18,15 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    lsl z1.s, z1.s, #31
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    asr z0.s, z0.s, #31
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    asr z1.s, z1.s, #31
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32:
@@ -74,14 +75,15 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    lsl z1.d, z1.d, #61
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    lsl z0.d, z0.d, #61
-; CHECK-NEXT:    asr z1.d, z1.d, #61
+; CHECK-NEXT:    lsl z1.d, z1.d, #61
 ; CHECK-NEXT:    asr z0.d, z0.d, #61
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    asr z1.d, z1.d, #61
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64:
@@ -116,10 +118,11 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: sext_v16i8_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16:
@@ -176,14 +179,16 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    sunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16:
@@ -369,10 +374,11 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32:
@@ -405,17 +411,20 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: sext_v16i8_v16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32:
@@ -465,28 +474,33 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    sunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z6.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
-; CHECK-NEXT:    stp q5, q3, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #96]
-; CHECK-NEXT:    stp q7, q1, [x1, #32]
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    sunpklo z3.s, z7.h
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    sunpklo z0.s, z6.h
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32:
@@ -664,12 +678,13 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sxtb z1.d, p0/m, z1.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sxtb z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64:
@@ -699,17 +714,20 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64:
@@ -744,31 +762,37 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: sext_v16i8_v16i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z6.d, z1.s
+; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q0, q4, [x0]
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    sunpklo z7.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    stp q4, q2, [x0]
+; CHECK-NEXT:    stp q3, q5, [x0, #64]
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q6, q1, [x0, #32]
-; CHECK-NEXT:    stp q5, q3, [x0, #64]
-; CHECK-NEXT:    stp q7, q0, [x0, #96]
+; CHECK-NEXT:    stp q2, q6, [x0, #32]
+; CHECK-NEXT:    stp q1, q0, [x0, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
@@ -819,64 +843,69 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v32i8_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z4.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z7, z3
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z16.d, z4.s
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    movprfx z16, z0
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z6.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
-; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z17, z5
+; CHECK-NEXT:    ext z17.b, z17.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
-; CHECK-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z19.d, z3.s
+; CHECK-NEXT:    sunpklo z16.d, z16.s
+; CHECK-NEXT:    movprfx z20, z3
+; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z19.d, z4.s
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z17.d, z17.s
 ; CHECK-NEXT:    sunpklo z18.d, z6.s
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z20.d, z1.s
+; CHECK-NEXT:    str q5, [x1]
+; CHECK-NEXT:    stp q0, q16, [x1, #128]
+; CHECK-NEXT:    sunpklo z0.d, z1.s
+; CHECK-NEXT:    sunpklo z16.d, z20.s
+; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    stp q16, q4, [x1, #128]
-; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    sunpklo z16.d, z0.s
-; CHECK-NEXT:    sunpklo z17.d, z17.s
-; CHECK-NEXT:    mov z4.d, z7.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q17, q0, [x1, #16]
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    ext z0.b, z0.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    ext z4.b, z4.b, z7.b, #8
-; CHECK-NEXT:    stp q19, q3, [x1, #160]
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q5, q17, [x1]
-; CHECK-NEXT:    sunpklo z5.d, z6.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    stp q20, q1, [x1, #192]
-; CHECK-NEXT:    sunpklo z7.d, z7.s
-; CHECK-NEXT:    sunpklo z1.d, z4.s
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    stp q3, q16, [x1, #64]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    ext z3.b, z3.b, z7.b, #8
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q16, q0, [x1, #32]
-; CHECK-NEXT:    stp q18, q5, [x1, #64]
-; CHECK-NEXT:    sunpklo z3.d, z6.s
-; CHECK-NEXT:    stp q7, q1, [x1, #224]
-; CHECK-NEXT:    stp q2, q3, [x1, #96]
+; CHECK-NEXT:    stp q19, q4, [x1, #160]
+; CHECK-NEXT:    sunpklo z4.d, z7.s
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q18, q6, [x1, #192]
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    str q1, [x1, #48]
+; CHECK-NEXT:    stp q2, q0, [x1, #224]
+; CHECK-NEXT:    stp q4, q3, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64:
@@ -1057,10 +1086,11 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-LABEL: sext_v8i16_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32:
@@ -1096,14 +1126,16 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
 ; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    sunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32:
@@ -1189,10 +1221,11 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64:
@@ -1219,17 +1252,20 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-LABEL: sext_v8i16_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    sunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64:
@@ -1267,28 +1303,33 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
 ; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    sunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z6.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
-; CHECK-NEXT:    stp q5, q3, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #96]
-; CHECK-NEXT:    stp q7, q1, [x1, #32]
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    sunpklo z3.d, z7.s
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    sunpklo z0.d, z6.s
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64:
@@ -1378,10 +1419,11 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-LABEL: sext_v4i32_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64:
@@ -1409,14 +1451,16 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.s, z0.s, z0.s
 ; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    sunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64:
@@ -1469,10 +1513,11 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: zext_v16i8_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16:
@@ -1529,14 +1574,16 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    uunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16:
@@ -1722,10 +1769,11 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32:
@@ -1758,17 +1806,20 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: zext_v16i8_v16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32:
@@ -1818,28 +1869,33 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    uunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z6.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
-; CHECK-NEXT:    stp q5, q3, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #96]
-; CHECK-NEXT:    stp q7, q1, [x1, #32]
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    uunpklo z3.s, z7.h
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    uunpklo z0.s, z6.h
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32:
@@ -2017,10 +2073,11 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64:
@@ -2050,17 +2107,20 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64:
@@ -2099,31 +2159,37 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-LABEL: zext_v16i8_v16i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z6.d, z1.s
+; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z4.d, z4.s
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q0, q4, [x0]
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    uunpklo z7.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    stp q4, q2, [x0]
+; CHECK-NEXT:    stp q3, q5, [x0, #64]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q6, q1, [x0, #32]
-; CHECK-NEXT:    stp q5, q3, [x0, #64]
-; CHECK-NEXT:    stp q7, q0, [x0, #96]
+; CHECK-NEXT:    stp q2, q6, [x0, #32]
+; CHECK-NEXT:    stp q1, q0, [x0, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
@@ -2182,64 +2248,69 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v32i8_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z7, z3
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z16.d, z4.s
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    movprfx z16, z0
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z6.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
-; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z17, z5
+; CHECK-NEXT:    ext z17.b, z17.b, z5.b, #8
+; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    uunpklo z5.d, z5.s
-; CHECK-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z19.d, z3.s
+; CHECK-NEXT:    uunpklo z16.d, z16.s
+; CHECK-NEXT:    movprfx z20, z3
+; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z19.d, z4.s
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z17.d, z17.s
 ; CHECK-NEXT:    uunpklo z18.d, z6.s
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z20.d, z1.s
+; CHECK-NEXT:    str q5, [x1]
+; CHECK-NEXT:    stp q0, q16, [x1, #128]
+; CHECK-NEXT:    uunpklo z0.d, z1.s
+; CHECK-NEXT:    uunpklo z16.d, z20.s
+; CHECK-NEXT:    uunpklo z4.d, z4.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    stp q16, q4, [x1, #128]
-; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    uunpklo z16.d, z0.s
-; CHECK-NEXT:    uunpklo z17.d, z17.s
-; CHECK-NEXT:    mov z4.d, z7.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q17, q0, [x1, #16]
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    ext z0.b, z0.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ext z4.b, z4.b, z7.b, #8
-; CHECK-NEXT:    stp q19, q3, [x1, #160]
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q5, q17, [x1]
-; CHECK-NEXT:    uunpklo z5.d, z6.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    stp q20, q1, [x1, #192]
-; CHECK-NEXT:    uunpklo z7.d, z7.s
-; CHECK-NEXT:    uunpklo z1.d, z4.s
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    stp q3, q16, [x1, #64]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    ext z3.b, z3.b, z7.b, #8
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q16, q0, [x1, #32]
-; CHECK-NEXT:    stp q18, q5, [x1, #64]
-; CHECK-NEXT:    uunpklo z3.d, z6.s
-; CHECK-NEXT:    stp q7, q1, [x1, #224]
-; CHECK-NEXT:    stp q2, q3, [x1, #96]
+; CHECK-NEXT:    stp q19, q4, [x1, #160]
+; CHECK-NEXT:    uunpklo z4.d, z7.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q18, q6, [x1, #192]
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    str q1, [x1, #48]
+; CHECK-NEXT:    stp q2, q0, [x1, #224]
+; CHECK-NEXT:    stp q4, q3, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64:
@@ -2443,10 +2514,11 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 ; CHECK-LABEL: zext_v8i16_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32:
@@ -2482,14 +2554,16 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
 ; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32:
@@ -2575,10 +2649,11 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64:
@@ -2607,17 +2682,20 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-LABEL: zext_v8i16_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q2, q1, [x0]
-; CHECK-NEXT:    stp q3, q0, [x0, #32]
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q0, q2, [x0]
+; CHECK-NEXT:    stp q1, q3, [x0, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64:
@@ -2659,28 +2737,33 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
 ; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z6.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z4.d, z4.s
+; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
-; CHECK-NEXT:    stp q5, q3, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #96]
-; CHECK-NEXT:    stp q7, q1, [x1, #32]
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    uunpklo z3.d, z7.s
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    uunpklo z0.d, z6.s
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64:
@@ -2780,10 +2863,11 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 ; CHECK-LABEL: zext_v4i32_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64:
@@ -2813,14 +2897,16 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.s, z0.s, z0.s
 ; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 9497ec88e57b4..372f6a06bf64b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -151,14 +151,14 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    movprfx z5, z0
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
@@ -279,35 +279,35 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    sunpklo z4.h, z0.b
-; CHECK-NEXT:    sunpklo z2.s, z3.h
-; CHECK-NEXT:    sunpklo z5.s, z4.h
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    movprfx z5, z4
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z7.h, z3.b
-; CHECK-NEXT:    sunpklo z16.h, z4.b
-; CHECK-NEXT:    sunpklo z3.s, z7.h
-; CHECK-NEXT:    sunpklo z4.s, z16.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z7.h, z2.b
+; CHECK-NEXT:    sunpklo z16.h, z3.b
+; CHECK-NEXT:    sunpklo z2.s, z7.h
+; CHECK-NEXT:    sunpklo z3.s, z16.h
 ; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
 ; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    movprfx z6, z4
-; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z3.s
-; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    ldr q4, [x1]
+; CHECK-NEXT:    movprfx z6, z3
+; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z2.s
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q3, [x1]
 ; CHECK-NEXT:    sunpklo z16.s, z16.h
-; CHECK-NEXT:    sunpklo z17.h, z4.b
-; CHECK-NEXT:    sunpklo z18.h, z3.b
+; CHECK-NEXT:    sunpklo z17.h, z3.b
+; CHECK-NEXT:    sunpklo z18.h, z2.b
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    sunpklo z19.s, z17.h
 ; CHECK-NEXT:    sunpklo z20.s, z18.h
@@ -316,44 +316,44 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z17.s, z17.h
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
 ; CHECK-NEXT:    sdivr z19.s, p0/m, z19.s, z20.s
-; CHECK-NEXT:    mov z20.d, z3.d
-; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    movprfx z20, z2
+; CHECK-NEXT:    ext z20.b, z20.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z20.h, z20.b
 ; CHECK-NEXT:    sunpklo z22.s, z20.h
 ; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
-; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    mov z18.d, z4.d
 ; CHECK-NEXT:    sunpklo z20.s, z20.h
-; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    movprfx z18, z3
+; CHECK-NEXT:    ext z18.b, z18.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z18.h, z18.b
 ; CHECK-NEXT:    sunpklo z21.s, z18.h
 ; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
 ; CHECK-NEXT:    sdivr z21.s, p0/m, z21.s, z22.s
-; CHECK-NEXT:    uzp1 z22.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z22.h, z4.h, z4.h
 ; CHECK-NEXT:    uzp1 z23.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z6.h, z7.h, z7.h
 ; CHECK-NEXT:    sdivr z18.s, p0/m, z18.s, z20.s
 ; CHECK-NEXT:    uzp1 z19.h, z19.h, z19.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z20.h, z17.h, z17.h
+; CHECK-NEXT:    uzp1 z16.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z17.h, z7.h, z7.h
 ; CHECK-NEXT:    splice z7.h, p0, { z22.h, z23.h }
-; CHECK-NEXT:    splice z5.h, p0, { z5.h, z6.h }
-; CHECK-NEXT:    uzp1 z16.h, z21.h, z21.h
-; CHECK-NEXT:    splice z2.h, p0, { z19.h, z20.h }
+; CHECK-NEXT:    uzp1 z4.h, z21.h, z21.h
+; CHECK-NEXT:    splice z6.h, p0, { z19.h, z20.h }
+; CHECK-NEXT:    uzp1 z5.h, z18.h, z18.h
+; CHECK-NEXT:    splice z4.h, p0, { z4.h, z5.h }
+; CHECK-NEXT:    splice z5.h, p0, { z16.h, z17.h }
+; CHECK-NEXT:    uzp1 z16.b, z6.b, z6.b
 ; CHECK-NEXT:    uzp1 z6.b, z7.b, z7.b
-; CHECK-NEXT:    uzp1 z7.b, z5.b, z5.b
-; CHECK-NEXT:    uzp1 z17.h, z18.h, z18.h
-; CHECK-NEXT:    splice z16.h, p0, { z16.h, z17.h }
-; CHECK-NEXT:    uzp1 z17.b, z2.b, z2.b
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z17.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z7.b, z5.b, z5.b
+; CHECK-NEXT:    splice z4.b, p0, { z16.b, z17.b }
 ; CHECK-NEXT:    splice z5.b, p0, { z6.b, z7.b }
-; CHECK-NEXT:    uzp1 z18.b, z16.b, z16.b
-; CHECK-NEXT:    splice z2.b, p0, { z17.b, z18.b }
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    mls z2.b, p0/m, z4.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z5.b, z1.b
-; CHECK-NEXT:    msb z2.b, p0/m, z4.b, z3.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
@@ -590,13 +590,13 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
@@ -668,21 +668,21 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sunpklo z5.s, z4.h
-; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    movprfx z16, z0
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z6.s, z3.h
-; CHECK-NEXT:    mov z7.d, z3.d
 ; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    movprfx z7, z3
 ; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    movprfx z6, z4
 ; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    movprfx z7, z1
 ; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
@@ -1201,14 +1201,14 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    movprfx z5, z0
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
@@ -1329,35 +1329,35 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    uunpklo z4.h, z0.b
-; CHECK-NEXT:    uunpklo z2.s, z3.h
-; CHECK-NEXT:    uunpklo z5.s, z4.h
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    movprfx z5, z4
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z7.h, z3.b
-; CHECK-NEXT:    uunpklo z16.h, z4.b
-; CHECK-NEXT:    uunpklo z3.s, z7.h
-; CHECK-NEXT:    uunpklo z4.s, z16.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z7.h, z2.b
+; CHECK-NEXT:    uunpklo z16.h, z3.b
+; CHECK-NEXT:    uunpklo z2.s, z7.h
+; CHECK-NEXT:    uunpklo z3.s, z16.h
 ; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
 ; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    movprfx z6, z4
-; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z3.s
-; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    ldr q4, [x1]
+; CHECK-NEXT:    movprfx z6, z3
+; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z2.s
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q3, [x1]
 ; CHECK-NEXT:    uunpklo z16.s, z16.h
-; CHECK-NEXT:    uunpklo z17.h, z4.b
-; CHECK-NEXT:    uunpklo z18.h, z3.b
+; CHECK-NEXT:    uunpklo z17.h, z3.b
+; CHECK-NEXT:    uunpklo z18.h, z2.b
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uunpklo z19.s, z17.h
 ; CHECK-NEXT:    uunpklo z20.s, z18.h
@@ -1366,44 +1366,44 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z17.s, z17.h
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
 ; CHECK-NEXT:    udivr z19.s, p0/m, z19.s, z20.s
-; CHECK-NEXT:    mov z20.d, z3.d
-; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    movprfx z20, z2
+; CHECK-NEXT:    ext z20.b, z20.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z20.h, z20.b
 ; CHECK-NEXT:    uunpklo z22.s, z20.h
 ; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
-; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    mov z18.d, z4.d
 ; CHECK-NEXT:    uunpklo z20.s, z20.h
-; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    movprfx z18, z3
+; CHECK-NEXT:    ext z18.b, z18.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z18.h, z18.b
 ; CHECK-NEXT:    uunpklo z21.s, z18.h
 ; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
 ; CHECK-NEXT:    udivr z21.s, p0/m, z21.s, z22.s
-; CHECK-NEXT:    uzp1 z22.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z22.h, z4.h, z4.h
 ; CHECK-NEXT:    uzp1 z23.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z6.h, z7.h, z7.h
 ; CHECK-NEXT:    udivr z18.s, p0/m, z18.s, z20.s
 ; CHECK-NEXT:    uzp1 z19.h, z19.h, z19.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z20.h, z17.h, z17.h
+; CHECK-NEXT:    uzp1 z16.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z17.h, z7.h, z7.h
 ; CHECK-NEXT:    splice z7.h, p0, { z22.h, z23.h }
-; CHECK-NEXT:    splice z5.h, p0, { z5.h, z6.h }
-; CHECK-NEXT:    uzp1 z16.h, z21.h, z21.h
-; CHECK-NEXT:    splice z2.h, p0, { z19.h, z20.h }
+; CHECK-NEXT:    uzp1 z4.h, z21.h, z21.h
+; CHECK-NEXT:    splice z6.h, p0, { z19.h, z20.h }
+; CHECK-NEXT:    uzp1 z5.h, z18.h, z18.h
+; CHECK-NEXT:    splice z4.h, p0, { z4.h, z5.h }
+; CHECK-NEXT:    splice z5.h, p0, { z16.h, z17.h }
+; CHECK-NEXT:    uzp1 z16.b, z6.b, z6.b
 ; CHECK-NEXT:    uzp1 z6.b, z7.b, z7.b
-; CHECK-NEXT:    uzp1 z7.b, z5.b, z5.b
-; CHECK-NEXT:    uzp1 z17.h, z18.h, z18.h
-; CHECK-NEXT:    splice z16.h, p0, { z16.h, z17.h }
-; CHECK-NEXT:    uzp1 z17.b, z2.b, z2.b
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z17.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z7.b, z5.b, z5.b
+; CHECK-NEXT:    splice z4.b, p0, { z16.b, z17.b }
 ; CHECK-NEXT:    splice z5.b, p0, { z6.b, z7.b }
-; CHECK-NEXT:    uzp1 z18.b, z16.b, z16.b
-; CHECK-NEXT:    splice z2.b, p0, { z17.b, z18.b }
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    mls z2.b, p0/m, z4.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z5.b, z1.b
-; CHECK-NEXT:    msb z2.b, p0/m, z4.b, z3.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
@@ -1640,13 +1640,13 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
@@ -1718,21 +1718,21 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    mov z16.d, z0.d
+; CHECK-NEXT:    movprfx z16, z0
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z6.s, z3.h
-; CHECK-NEXT:    mov z7.d, z3.d
 ; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    movprfx z7, z3
 ; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    movprfx z6, z4
 ; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    movprfx z7, z1
 ; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 7df362826d052..f7fadaa1217bc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -256,12 +256,13 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32:
@@ -306,18 +307,20 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.s
-; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.s
+; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32:
@@ -442,12 +445,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64:
@@ -481,21 +485,24 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q2, q1, [x1]
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
+; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
@@ -542,43 +549,43 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z5.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    movprfx z6, z2
 ; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    uunpklo z7.d, z7.s
+; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    stp q5, q0, [x1, #64]
-; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    stp q1, q4, [x1]
-; CHECK-NEXT:    movprfx z1, z6
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z6.d
-; CHECK-NEXT:    movprfx z0, z7
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    stp q2, q1, [x1, #96]
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z7.d
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    movprfx z0, z6
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
@@ -997,12 +1004,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64:
@@ -1035,18 +1043,20 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64:
@@ -1669,12 +1679,13 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32:
@@ -1719,18 +1730,20 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    scvtf z2.s, p0/m, z2.s
-; CHECK-NEXT:    scvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    scvtf z2.s, p0/m, z2.s
+; CHECK-NEXT:    scvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32:
@@ -1833,12 +1846,13 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64:
@@ -1872,21 +1886,24 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    sunpklo z1.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q2, q1, [x1]
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
+; CHECK-NEXT:    stp q0, q2, [x1]
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
@@ -1933,43 +1950,43 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z5.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    movprfx z5, z3
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    movprfx z6, z2
 ; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    sunpklo z7.d, z7.s
+; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    stp q5, q0, [x1, #64]
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    stp q1, q4, [x1]
-; CHECK-NEXT:    movprfx z1, z6
-; CHECK-NEXT:    scvtf z1.d, p0/m, z6.d
-; CHECK-NEXT:    movprfx z0, z7
-; CHECK-NEXT:    scvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    stp q2, q1, [x1, #96]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    scvtf z3.d, p0/m, z7.d
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    movprfx z0, z6
+; CHECK-NEXT:    scvtf z0.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
@@ -2293,12 +2310,13 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    sunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64:
@@ -2331,18 +2349,20 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    sunpklo z2.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z3, z1
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    stp q3, q1, [x1]
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64:
@@ -2388,15 +2408,15 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ldp q5, q3, [x0]
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    mov z7.d, z5.d
+; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    movprfx z4, z1
 ; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    movprfx z6, z3
 ; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    movprfx z7, z5
 ; CHECK-NEXT:    ext z7.b, z7.b, z5.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 3627390b5edfa..daf2734ce7d55 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -99,15 +99,16 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ldp q3, q4, [x0]
-; CHECK-NEXT:    add z2.s, z0.s, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    add z3.s, z3.s, z3.s
-; CHECK-NEXT:    add z4.s, z4.s, z4.s
-; CHECK-NEXT:    mov z0.s, s0
-; CHECK-NEXT:    stp q1, q2, [x0, #32]
-; CHECK-NEXT:    stp q3, q4, [x0]
+; CHECK-NEXT:    add z5.s, z0.s, z0.s
+; CHECK-NEXT:    mov z0.s, s2
+; CHECK-NEXT:    add z2.s, z3.s, z3.s
+; CHECK-NEXT:    add z3.s, z4.s, z4.s
+; CHECK-NEXT:    stp q1, q5, [x0, #32]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: test2:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index 48a642c908bfe..42b947604b860 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1815,17 +1815,18 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.h, z0.b
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    lsl z1.h, z1.h, #15
-; CHECK-NEXT:    asr z1.h, z1.h, #15
+; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
-; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    asr z1.h, z1.h, #15
+; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
@@ -2572,15 +2573,16 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    lsl z1.d, z1.d, #63
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    lsl z0.d, z0.d, #63
-; CHECK-NEXT:    asr z1.d, z1.d, #63
+; CHECK-NEXT:    lsl z1.d, z1.d, #63
 ; CHECK-NEXT:    asr z0.d, z0.d, #63
-; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    asr z1.d, z1.d, #63
+; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index 265480b571970..9b3da75be47ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -797,20 +797,21 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpklo z1.h, z0.b
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    lsl z1.h, z1.h, #15
-; CHECK-NEXT:    asr z1.h, z1.h, #15
+; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
+; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
-; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; CHECK-NEXT:    mov z0.h, #0 // =0x0
-; CHECK-NEXT:    st1h { z0.h }, p1, [x0, x8, lsl #1]
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    asr z1.h, z1.h, #15
+; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    mov z1.h, #0 // =0x0
+; CHECK-NEXT:    st1h { z1.h }, p1, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
@@ -1165,18 +1166,19 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    movprfx z1, z0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    lsl z1.d, z1.d, #63
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    lsl z0.d, z0.d, #63
-; CHECK-NEXT:    asr z1.d, z1.d, #63
+; CHECK-NEXT:    lsl z1.d, z1.d, #63
 ; CHECK-NEXT:    asr z0.d, z0.d, #63
-; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
-; CHECK-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    asr z1.d, z1.d, #63
+; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    st1d { z1.d }, p1, [x0, x8, lsl #3]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 688537704a6f7..7363c306033a1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -37,34 +37,39 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
 ; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
-; STREAMING-SVE-NEXT:    uunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
-; STREAMING-SVE-NEXT:    uunpklo z3.h, z0.b
+; STREAMING-SVE-NEXT:    movprfx z2, z1
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z1.b, #8
+; STREAMING-SVE-NEXT:    movprfx z3, z0
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
-; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z1.h, z1.b
 ; STREAMING-SVE-NEXT:    uunpklo z0.h, z0.b
-; STREAMING-SVE-NEXT:    uunpklo z4.s, z2.h
-; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
-; STREAMING-SVE-NEXT:    uunpklo z6.s, z3.h
-; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
-; STREAMING-SVE-NEXT:    mov z5.d, z1.d
-; STREAMING-SVE-NEXT:    uunpklo z7.s, z0.h
-; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
-; STREAMING-SVE-NEXT:    uunpklo z2.s, z2.h
-; STREAMING-SVE-NEXT:    uunpklo z3.s, z3.h
-; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
-; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z2.h, z2.b
+; STREAMING-SVE-NEXT:    uunpklo z3.h, z3.b
+; STREAMING-SVE-NEXT:    movprfx z4, z1
+; STREAMING-SVE-NEXT:    ext z4.b, z4.b, z1.b, #8
+; STREAMING-SVE-NEXT:    movprfx z7, z0
+; STREAMING-SVE-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z1.s, z1.h
 ; STREAMING-SVE-NEXT:    uunpklo z0.s, z0.h
-; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    movprfx z5, z2
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z2.b, #8
+; STREAMING-SVE-NEXT:    movprfx z6, z3
+; STREAMING-SVE-NEXT:    ext z6.b, z6.b, z3.b, #8
+; STREAMING-SVE-NEXT:    uunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT:    uunpklo z4.s, z4.h
+; STREAMING-SVE-NEXT:    uunpklo z7.s, z7.h
+; STREAMING-SVE-NEXT:    uunpklo z3.s, z3.h
 ; STREAMING-SVE-NEXT:    uunpklo z5.s, z5.h
-; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
-; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
-; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT:    uunpklo z6.s, z6.h
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
+; STREAMING-SVE-NEXT:    add z1.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    add z2.s, z7.s, z4.s
+; STREAMING-SVE-NEXT:    add z3.s, z6.s, z5.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
+; STREAMING-SVE-NEXT:    add z1.s, z2.s, z3.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
 ; STREAMING-SVE-NEXT:    fmov w0, s0
 ; STREAMING-SVE-NEXT:    ret
@@ -104,34 +109,39 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
 ; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
-; STREAMING-SVE-NEXT:    sunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
-; STREAMING-SVE-NEXT:    sunpklo z3.h, z0.b
+; STREAMING-SVE-NEXT:    movprfx z2, z1
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z1.b, #8
+; STREAMING-SVE-NEXT:    movprfx z3, z0
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
-; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z1.h, z1.b
 ; STREAMING-SVE-NEXT:    sunpklo z0.h, z0.b
-; STREAMING-SVE-NEXT:    sunpklo z4.s, z2.h
-; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
-; STREAMING-SVE-NEXT:    sunpklo z6.s, z3.h
-; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
-; STREAMING-SVE-NEXT:    mov z5.d, z1.d
-; STREAMING-SVE-NEXT:    sunpklo z7.s, z0.h
-; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
-; STREAMING-SVE-NEXT:    sunpklo z2.s, z2.h
-; STREAMING-SVE-NEXT:    sunpklo z3.s, z3.h
-; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
-; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z2.h, z2.b
+; STREAMING-SVE-NEXT:    sunpklo z3.h, z3.b
+; STREAMING-SVE-NEXT:    movprfx z4, z1
+; STREAMING-SVE-NEXT:    ext z4.b, z4.b, z1.b, #8
+; STREAMING-SVE-NEXT:    movprfx z7, z0
+; STREAMING-SVE-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z1.s, z1.h
 ; STREAMING-SVE-NEXT:    sunpklo z0.s, z0.h
-; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    movprfx z5, z2
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z2.b, #8
+; STREAMING-SVE-NEXT:    movprfx z6, z3
+; STREAMING-SVE-NEXT:    ext z6.b, z6.b, z3.b, #8
+; STREAMING-SVE-NEXT:    sunpklo z2.s, z2.h
+; STREAMING-SVE-NEXT:    sunpklo z4.s, z4.h
+; STREAMING-SVE-NEXT:    sunpklo z7.s, z7.h
+; STREAMING-SVE-NEXT:    sunpklo z3.s, z3.h
 ; STREAMING-SVE-NEXT:    sunpklo z5.s, z5.h
-; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
-; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
-; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
+; STREAMING-SVE-NEXT:    sunpklo z6.s, z6.h
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
+; STREAMING-SVE-NEXT:    add z1.s, z3.s, z2.s
+; STREAMING-SVE-NEXT:    add z2.s, z7.s, z4.s
+; STREAMING-SVE-NEXT:    add z3.s, z6.s, z5.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
+; STREAMING-SVE-NEXT:    add z1.s, z2.s, z3.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
 ; STREAMING-SVE-NEXT:    fmov w0, s0
 ; STREAMING-SVE-NEXT:    ret

>From 56ac99a1339398b952815334df020c568d079bda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at arm.com>
Date: Thu, 7 Aug 2025 12:09:42 +0000
Subject: [PATCH 2/2] [AArch64] WIP Regalloc hints for EXT_ZZZI

This tries to ensure that the dst and first src register are mapped to
the same physical register. This isn't always possible because the
MachineScheduler has already moved instructions in a way that causes
interferences if both virt regs get mapped to the same phys reg.

WIP because there is probably a better place to do this.
---
 .../AArch64/AArch64PostCoalescerPass.cpp      | 10 ++++--
 .../AArch64/named-vector-shuffles-sve.ll      | 36 +++++++++----------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
index cdf2822f3ed9d..b7d69b68af4ee 100644
--- a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp
@@ -53,9 +53,6 @@ bool AArch64PostCoalescer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
-  if (!FuncInfo->hasStreamingModeChanges())
-    return false;
 
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
@@ -86,6 +83,13 @@ bool AArch64PostCoalescer::runOnMachineFunction(MachineFunction &MF) {
         Changed = true;
         break;
       }
+      case AArch64::EXT_ZZZI:
+        Register DstReg = MI.getOperand(0).getReg();
+        Register SrcReg1 = MI.getOperand(1).getReg();
+        if (SrcReg1 != DstReg) {
+          MRI->setRegAllocationHint(DstReg, 0, SrcReg1);
+        }
+        break;
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 96311c2849f7c..69e805d9ca2ee 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -256,13 +256,12 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
 define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    and z0.d, z0.d, #0x1
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
   ret <vscale x 2 x i1> %res
@@ -272,13 +271,12 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
 define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv4i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
   ret <vscale x 4 x i1> %res
@@ -288,13 +286,12 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
 define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv8i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    and z0.h, z0.h, #0x1
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    and z1.h, z1.h, #0x1
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
   ret <vscale x 8 x i1> %res
@@ -304,13 +301,12 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
 define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv16i1_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z0.d, z1.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    and z0.b, z0.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    and z1.b, z1.b, #0x1
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
   ret <vscale x 16 x i1> %res



More information about the llvm-branch-commits mailing list