[llvm] [AArch64][ISel] Subvector extracts can use undef for second EXT input (PR #151729)

Gaëtan Bossu via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 1 09:53:10 PDT 2025


https://github.com/gbossu created https://github.com/llvm/llvm-project/pull/151729

👉  This is a chained PR.

Extracting a subvector using vector_splice only requires one input, so let's mark the second input as undef to give the register allocator more freedom.

This will later allow us to use the SVE2 constructive variant of EXT without requiring a MOV. That is because that variant of EXT requires two consecutive Z registers as input. As a consequence, extracting a subvector from e.g. z2 into z0 would require:
  z3 = MOV z2
  z0 = EXT_ZZI_B { z2, z3 }, idx

With this change, the z3 part of the { z2, z3 } tuple will be marked as undef, allowing the MOV to be simplified.

We just need to add patterns for EXT_ZZI_B now, currently only the the destructive EXT_ZZI variant is selected.

>From ebcb4929004ae3f08b2ca3d5d246f29aa73600e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu at arm.com>
Date: Thu, 31 Jul 2025 13:07:28 +0000
Subject: [PATCH] [AArch64][ISel] Subvector extracts can use undef for second
 EXT input

This will later allow us to use the SVE2 constructive variant of EXT
without requiring a MOV. That is because that variant of EXT requires
two consecutive Z registers as input. As a consequence, extracting a
subvector from e.g. z2 into z0 would require:
  z3 = MOV z2
  z0 = EXT_ZZI_B { z2, z3 }, idx

With this change, the z3 part of the { z2, z3 } tuple will be marked as
undef, allowing the MOV to be simplified.

We just need to add patterns for EXT_ZZI_B now, currently only the the
destructive EXT_ZZI variant is selected.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   4 +-
 .../AArch64/get-active-lane-mask-extract.ll   |   8 +-
 .../AArch64/sve-fixed-length-int-div.ll       |  52 +--
 .../AArch64/sve-fixed-length-int-rem.ll       |  44 +--
 .../sve-fixed-length-masked-scatter.ll        |  12 +-
 .../AArch64/sve-fixed-length-shuffles.ll      |   2 +-
 ...aming-mode-fixed-length-fp-extend-trunc.ll |   2 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll |  20 +-
 ...sve-streaming-mode-fixed-length-int-div.ll | 120 +++---
 ...streaming-mode-fixed-length-int-extends.ll | 208 +++++------
 ...sve-streaming-mode-fixed-length-int-rem.ll |  88 ++---
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 116 +++---
 ...-streaming-mode-fixed-length-reductions.ll |  40 +-
 .../sve2-fixed-length-extract-subvector.ll    | 341 ++++++++++++++++++
 14 files changed, 700 insertions(+), 357 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c9fc67bb0119..16f02293c78f3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15544,7 +15544,9 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
 
     assert(InVT.isScalableVector() && "Unexpected vector type!");
     // Move requested subvector to the start of the vector and try again.
-    SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
+    // There's no need for a second input to vector_splice, so use undef there.
+    SDValue Splice =
+        DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, DAG.getUNDEF(InVT), Idx);
     return convertFromScalableVector(DAG, VT, Splice);
   }
 
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
index 5e01612e3881a..50975d16c7e9e 100644
--- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -180,7 +180,7 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 {
 ; CHECK-SVE-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-SVE-NEXT:    fmov s0, w8
 ; CHECK-SVE-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-SVE-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-SVE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-SVE-NEXT:    b use
@@ -192,7 +192,7 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 {
 ; CHECK-SVE2p1-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-SVE2p1-NEXT:    fmov s0, w8
 ; CHECK-SVE2p1-NEXT:    mov v0.s[1], v1.s[1]
-; CHECK-SVE2p1-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-SVE2p1-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-SVE2p1-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SVE2p1-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-SVE2p1-NEXT:    b use
@@ -204,10 +204,10 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 {
 ; CHECK-SME2-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-SME2-NEXT:    fmov s2, w8
 ; CHECK-SME2-NEXT:    mov z0.s, z1.s[1]
-; CHECK-SME2-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-SME2-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-SME2-NEXT:    zip1 z0.s, z2.s, z0.s
+; CHECK-SME2-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-SME2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-SME2-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-SME2-NEXT:    b use
     %r = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
     %v0 = call <2 x i1> @llvm.vector.extract.v2i1.nxv4i1.i64(<vscale x 4 x i1> %r, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index 0ddf434eff930..4d29b1724a8dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -125,7 +125,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
@@ -210,7 +210,7 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
@@ -239,24 +239,24 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
@@ -398,7 +398,7 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
@@ -476,7 +476,7 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
@@ -858,7 +858,7 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
@@ -930,12 +930,12 @@ define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
 ; CHECK-NEXT:    ld1b { z1.h }, p0/z, [x0]
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    udivr z0.s, p1/m, z0.s, z1.s
 ; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
@@ -959,24 +959,24 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
@@ -1118,7 +1118,7 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
@@ -1187,7 +1187,7 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #128
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 2d78945399176..9ce174b228dd5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -129,8 +129,8 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
 ; VBITS_GE_256-NEXT:    sunpklo z5.s, z3.h
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
@@ -222,8 +222,8 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
@@ -254,8 +254,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
@@ -263,15 +263,15 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
@@ -425,7 +425,7 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
@@ -512,7 +512,7 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p1.h, vl64
@@ -947,8 +947,8 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z4.s, z2.h
 ; VBITS_GE_256-NEXT:    uunpklo z5.s, z3.h
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
@@ -1040,8 +1040,8 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
@@ -1072,8 +1072,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
@@ -1081,15 +1081,15 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
@@ -1243,7 +1243,7 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
@@ -1330,7 +1330,7 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p1.h, vl64
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index ed03f9b322432..1cbe97a54e900 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -338,14 +338,14 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
 ; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z3.d]
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z4.d]
 ; VBITS_GE_256-NEXT:    ret
@@ -715,14 +715,14 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
 ; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z3.d]
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z4.d]
 ; VBITS_GE_256-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index c48ee3939bd2e..2560f45a153b4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -13,7 +13,7 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) v
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #16
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; CHECK-NEXT:    st2 { v0.4s, v1.4s }, [x0]
 ; CHECK-NEXT:    ret
   %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 56149e99b15f8..e3d0a72c74b87 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -127,7 +127,7 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 94d756a36ab92..86de930cecca9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -323,11 +323,11 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzu z2.s, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z3.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -606,7 +606,7 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z4.h, z1.h[3]
 ; CHECK-NEXT:    mov z7.h, z1.h[2]
 ; CHECK-NEXT:    mov z17.h, z0.h[1]
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.h
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z5.h
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z6.h
@@ -1109,11 +1109,11 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.s
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -2018,11 +2018,11 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzs z3.s, p0/m, z3.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -2302,7 +2302,7 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z4.h, z1.h[3]
 ; CHECK-NEXT:    mov z7.h, z1.h[2]
 ; CHECK-NEXT:    mov z17.h, z0.h[1]
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.h
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.h
 ; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.h
@@ -2805,11 +2805,11 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.s
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 1fdcd4f826870..ae7c676172867 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -63,7 +63,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
@@ -129,24 +129,24 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uzp1 z5.h, z2.h, z2.h
@@ -246,45 +246,45 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q2, [x0, #16]
 ; CHECK-NEXT:    sunpklo z1.h, z3.b
 ; CHECK-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z7.h, z6.b
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z0.s, z1.h
 ; CHECK-NEXT:    sunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    sunpklo z17.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z6.h, z6.b
+; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z5.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z5.s
+; CHECK-NEXT:    sunpklo z6.h, z6.b
 ; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    sunpklo z4.h, z2.b
 ; CHECK-NEXT:    sunpklo z2.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
 ; CHECK-NEXT:    ldr q5, [x0]
 ; CHECK-NEXT:    sunpklo z16.h, z5.b
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z18.s, z16.h
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z16.s, z16.h
 ; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    sunpklo z18.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    sunpklo z16.s, z6.h
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    uzp1 z20.h, z17.h, z17.h
 ; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
@@ -539,7 +539,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
@@ -604,17 +604,17 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    uzp1 z4.h, z5.h, z5.h
@@ -977,7 +977,7 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
@@ -1043,24 +1043,24 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uzp1 z5.h, z2.h, z2.h
@@ -1160,45 +1160,45 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q2, [x0, #16]
 ; CHECK-NEXT:    uunpklo z1.h, z3.b
 ; CHECK-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z7.h, z6.b
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z0.s, z1.h
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    uunpklo z17.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z6.h, z6.b
+; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z5.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z5.s
+; CHECK-NEXT:    uunpklo z6.h, z6.b
 ; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    uunpklo z4.h, z2.b
 ; CHECK-NEXT:    uunpklo z2.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
 ; CHECK-NEXT:    ldr q5, [x0]
 ; CHECK-NEXT:    uunpklo z16.h, z5.b
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z18.s, z16.h
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z16.s, z16.h
 ; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    uunpklo z18.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uunpklo z16.s, z6.h
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    uzp1 z20.h, z17.h, z17.h
 ; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
@@ -1453,7 +1453,7 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
@@ -1518,17 +1518,17 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    uzp1 z4.h, z5.h, z5.h
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index 75911e5ff1569..b022c19363ed6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -179,7 +179,7 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    sunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -409,10 +409,10 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
@@ -468,23 +468,23 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    sunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
@@ -703,7 +703,7 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
@@ -748,22 +748,22 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z6.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z7.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    stp q4, q2, [x0]
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q6, q1, [x0, #32]
@@ -824,56 +824,56 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z4.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z16.d, z4.s
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z6.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    sunpklo z20.d, z1.s
 ; CHECK-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z19.d, z3.s
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
 ; CHECK-NEXT:    sunpklo z18.d, z6.s
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z20.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z17.b, z17.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z19.d, z3.s
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    stp q16, q4, [x1, #128]
-; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z16.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z17.d, z17.s
 ; CHECK-NEXT:    mov z4.d, z7.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    ext z4.b, z4.b, z7.b, #8
-; CHECK-NEXT:    stp q19, q3, [x1, #160]
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q5, q17, [x1]
 ; CHECK-NEXT:    sunpklo z5.d, z6.s
 ; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    stp q20, q1, [x1, #192]
-; CHECK-NEXT:    sunpklo z7.d, z7.s
-; CHECK-NEXT:    sunpklo z1.d, z4.s
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    stp q19, q3, [x1, #160]
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    stp q16, q0, [x1, #32]
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    stp q20, q1, [x1, #192]
 ; CHECK-NEXT:    stp q18, q5, [x1, #64]
+; CHECK-NEXT:    sunpklo z1.d, z4.s
 ; CHECK-NEXT:    sunpklo z3.d, z6.s
 ; CHECK-NEXT:    stp q7, q1, [x1, #224]
 ; CHECK-NEXT:    stp q2, q3, [x1, #96]
@@ -1099,7 +1099,7 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -1223,10 +1223,10 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
@@ -1270,23 +1270,23 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z6.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
@@ -1412,7 +1412,7 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    sunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -1532,7 +1532,7 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-NEXT:    uunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -1762,10 +1762,10 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
@@ -1821,23 +1821,23 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    uunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
@@ -2054,7 +2054,7 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
@@ -2103,22 +2103,22 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z6.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z7.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    stp q4, q2, [x0]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q6, q1, [x0, #32]
@@ -2187,56 +2187,56 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z16.d, z4.s
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z6.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov z17.d, z5.d
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    uunpklo z20.d, z1.s
 ; CHECK-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z19.d, z3.s
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
 ; CHECK-NEXT:    uunpklo z18.d, z6.s
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z20.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z17.b, z17.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z19.d, z3.s
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    stp q16, q4, [x1, #128]
-; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z16.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z17.d, z17.s
 ; CHECK-NEXT:    mov z4.d, z7.d
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ext z4.b, z4.b, z7.b, #8
-; CHECK-NEXT:    stp q19, q3, [x1, #160]
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    uunpklo z7.d, z7.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q5, q17, [x1]
 ; CHECK-NEXT:    uunpklo z5.d, z6.s
 ; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    stp q20, q1, [x1, #192]
-; CHECK-NEXT:    uunpklo z7.d, z7.s
-; CHECK-NEXT:    uunpklo z1.d, z4.s
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    stp q19, q3, [x1, #160]
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    stp q16, q0, [x1, #32]
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    stp q20, q1, [x1, #192]
 ; CHECK-NEXT:    stp q18, q5, [x1, #64]
+; CHECK-NEXT:    uunpklo z1.d, z4.s
 ; CHECK-NEXT:    uunpklo z3.d, z6.s
 ; CHECK-NEXT:    stp q7, q1, [x1, #224]
 ; CHECK-NEXT:    stp q2, q3, [x1, #96]
@@ -2485,7 +2485,7 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -2611,10 +2611,10 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
@@ -2662,23 +2662,23 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z6.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
 ; CHECK-NEXT:    stp q6, q0, [x1, #96]
 ; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
@@ -2816,7 +2816,7 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 9497ec88e57b4..bffef1352e44f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -68,8 +68,8 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
@@ -146,8 +146,8 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
@@ -155,14 +155,14 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
@@ -283,8 +283,8 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z4.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z3.h
 ; CHECK-NEXT:    sunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
@@ -292,14 +292,14 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z7.h, z3.b
 ; CHECK-NEXT:    sunpklo z16.h, z4.b
 ; CHECK-NEXT:    sunpklo z3.s, z7.h
 ; CHECK-NEXT:    sunpklo z4.s, z16.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    movprfx z6, z4
 ; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z3.s
@@ -311,23 +311,23 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    sunpklo z19.s, z17.h
 ; CHECK-NEXT:    sunpklo z20.s, z18.h
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    ext z17.b, z17.b, z0.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z17.s, z17.h
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
 ; CHECK-NEXT:    sdivr z19.s, p0/m, z19.s, z20.s
 ; CHECK-NEXT:    mov z20.d, z3.d
-; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    ext z20.b, z20.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z20.h, z20.b
 ; CHECK-NEXT:    sunpklo z22.s, z20.h
-; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
+; CHECK-NEXT:    ext z20.b, z20.b, z0.b, #8
 ; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    mov z18.d, z4.d
 ; CHECK-NEXT:    sunpklo z20.s, z20.h
-; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z18.h, z18.b
 ; CHECK-NEXT:    sunpklo z21.s, z18.h
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
 ; CHECK-NEXT:    sdivr z21.s, p0/m, z21.s, z22.s
 ; CHECK-NEXT:    uzp1 z22.h, z2.h, z2.h
@@ -595,7 +595,7 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
@@ -675,15 +675,15 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
 ; CHECK-NEXT:    mov z7.d, z3.d
 ; CHECK-NEXT:    sunpklo z16.s, z16.h
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    mov z6.d, z4.d
-; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uzp1 z16.h, z5.h, z5.h
@@ -1118,8 +1118,8 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
@@ -1196,8 +1196,8 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
@@ -1205,14 +1205,14 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
@@ -1333,8 +1333,8 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z4.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z3.h
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
@@ -1342,14 +1342,14 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z7.h, z3.b
 ; CHECK-NEXT:    uunpklo z16.h, z4.b
 ; CHECK-NEXT:    uunpklo z3.s, z7.h
 ; CHECK-NEXT:    uunpklo z4.s, z16.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    movprfx z6, z4
 ; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z3.s
@@ -1361,23 +1361,23 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uunpklo z19.s, z17.h
 ; CHECK-NEXT:    uunpklo z20.s, z18.h
-; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    ext z17.b, z17.b, z0.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z17.s, z17.h
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
 ; CHECK-NEXT:    udivr z19.s, p0/m, z19.s, z20.s
 ; CHECK-NEXT:    mov z20.d, z3.d
-; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
+; CHECK-NEXT:    ext z20.b, z20.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z20.h, z20.b
 ; CHECK-NEXT:    uunpklo z22.s, z20.h
-; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
+; CHECK-NEXT:    ext z20.b, z20.b, z0.b, #8
 ; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    mov z18.d, z4.d
 ; CHECK-NEXT:    uunpklo z20.s, z20.h
-; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z18.h, z18.b
 ; CHECK-NEXT:    uunpklo z21.s, z18.h
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    ext z18.b, z18.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
 ; CHECK-NEXT:    udivr z21.s, p0/m, z21.s, z22.s
 ; CHECK-NEXT:    uzp1 z22.h, z2.h, z2.h
@@ -1645,7 +1645,7 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
@@ -1725,15 +1725,15 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
 ; CHECK-NEXT:    mov z7.d, z3.d
 ; CHECK-NEXT:    uunpklo z16.s, z16.h
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    mov z6.d, z4.d
-; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uzp1 z16.h, z5.h, z5.h
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 7df362826d052..8b0bb7a7d4e15 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -309,11 +309,11 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.s
 ; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -485,14 +485,14 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
@@ -546,39 +546,39 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z5.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    mov z6.d, z2.d
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
+; CHECK-NEXT:    uunpklo z4.d, z4.s
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z6.d, z6.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
-; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    uunpklo z7.d, z7.s
-; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    stp q5, q0, [x1, #64]
-; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    stp q1, q4, [x1]
 ; CHECK-NEXT:    movprfx z1, z6
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z6.d
 ; CHECK-NEXT:    movprfx z0, z7
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    stp q2, q1, [x1, #96]
+; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
@@ -1038,11 +1038,11 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -1722,11 +1722,11 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    scvtf z2.s, p0/m, z2.s
 ; CHECK-NEXT:    scvtf z3.s, p0/m, z3.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -1876,14 +1876,14 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
@@ -1937,39 +1937,39 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z5.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    mov z6.d, z2.d
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z6.d, z6.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
-; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    sunpklo z7.d, z7.s
-; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    stp q5, q0, [x1, #64]
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    stp q1, q4, [x1]
 ; CHECK-NEXT:    movprfx z1, z6
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z6.d
 ; CHECK-NEXT:    movprfx z0, z7
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    stp q2, q1, [x1, #96]
+; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
@@ -2334,11 +2334,11 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x1, #32]
@@ -2390,35 +2390,35 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q5, q3, [x0]
 ; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z7.d, z5.d
-; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z5.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z0.b, #8
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEXT:    sunpklo z7.d, z7.s
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
-; CHECK-NEXT:    stp q1, q4, [x1, #64]
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    scvtf z1.d, p0/m, z5.d
 ; CHECK-NEXT:    stp q0, q2, [x1, #96]
 ; CHECK-NEXT:    movprfx z0, z6
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z6.d
 ; CHECK-NEXT:    movprfx z2, z7
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z7.d
-; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
+; CHECK-NEXT:    stp q1, q4, [x1, #64]
+; CHECK-NEXT:    movprfx z1, z5
+; CHECK-NEXT:    scvtf z1.d, p0/m, z5.d
+; CHECK-NEXT:    stp q1, q2, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
index 688537704a6f7..93d6da19c0c33 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll
@@ -37,32 +37,32 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
 ; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
-; STREAMING-SVE-NEXT:    uunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT:    uunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    uunpklo z3.h, z0.b
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z1.h, z1.b
 ; STREAMING-SVE-NEXT:    uunpklo z0.h, z0.b
 ; STREAMING-SVE-NEXT:    uunpklo z4.s, z2.h
-; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z6.s, z3.h
-; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; STREAMING-SVE-NEXT:    mov z5.d, z1.d
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z0.b, #8
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT:    uunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    uunpklo z2.s, z2.h
 ; STREAMING-SVE-NEXT:    uunpklo z3.s, z3.h
-; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
-; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
-; STREAMING-SVE-NEXT:    uunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT:    uunpklo z5.s, z5.h
 ; STREAMING-SVE-NEXT:    uunpklo z0.s, z0.h
 ; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
-; STREAMING-SVE-NEXT:    uunpklo z5.s, z5.h
-; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
 ; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
 ; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
 ; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
@@ -104,32 +104,32 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) {
 ; STREAMING-SVE-LABEL: reduce_saddv_v16i8:
 ; STREAMING-SVE:       // %bb.0:
 ; STREAMING-SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
-; STREAMING-SVE-NEXT:    sunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-SVE-NEXT:    sunpklo z2.h, z1.b
 ; STREAMING-SVE-NEXT:    sunpklo z3.h, z0.b
 ; STREAMING-SVE-NEXT:    ptrue p0.s, vl4
-; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z1.b, #8
+; STREAMING-SVE-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z1.h, z1.b
 ; STREAMING-SVE-NEXT:    sunpklo z0.h, z0.b
 ; STREAMING-SVE-NEXT:    sunpklo z4.s, z2.h
-; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z6.s, z3.h
-; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; STREAMING-SVE-NEXT:    mov z5.d, z1.d
+; STREAMING-SVE-NEXT:    ext z2.b, z2.b, z0.b, #8
+; STREAMING-SVE-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z7.s, z0.h
+; STREAMING-SVE-NEXT:    sunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
+; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; STREAMING-SVE-NEXT:    sunpklo z2.s, z2.h
 ; STREAMING-SVE-NEXT:    sunpklo z3.s, z3.h
-; STREAMING-SVE-NEXT:    add z4.s, z6.s, z4.s
-; STREAMING-SVE-NEXT:    ext z5.b, z5.b, z1.b, #8
-; STREAMING-SVE-NEXT:    sunpklo z1.s, z1.h
+; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
+; STREAMING-SVE-NEXT:    sunpklo z5.s, z5.h
 ; STREAMING-SVE-NEXT:    sunpklo z0.s, z0.h
 ; STREAMING-SVE-NEXT:    add z2.s, z3.s, z2.s
-; STREAMING-SVE-NEXT:    sunpklo z5.s, z5.h
-; STREAMING-SVE-NEXT:    add z1.s, z7.s, z1.s
-; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
 ; STREAMING-SVE-NEXT:    add z1.s, z4.s, z1.s
+; STREAMING-SVE-NEXT:    add z0.s, z0.s, z5.s
 ; STREAMING-SVE-NEXT:    add z0.s, z2.s, z0.s
 ; STREAMING-SVE-NEXT:    add z0.s, z1.s, z0.s
 ; STREAMING-SVE-NEXT:    uaddv d0, p0, z0.s
diff --git a/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll
new file mode 100644
index 0000000000000..b96fad8239190
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+bf16 -verify-machineinstrs %s -o - | FileCheck %s
+
+; This is a similar test to sve-fixed-length-extract-subvector.ll, but this one
+; uses SVE2 and extracts multiple subvectors at once to ensure that the ext
+; instruction is used (instead of just using smaller ld/st instructions with an
+; offset).
+
+; Test the patterns selecting EXT_ZZI and EXT_ZZI_B for fixed-length vectors
+; when SVE2 is available.
+
+;
+; Use NEON for 128-bit vectors
+;
+
+define void @extract_v4i32_halves(ptr %in, ptr %out, ptr %out2) {
+; CHECK-LABEL: extract_v4i32_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    str d1, [x1]
+; CHECK-NEXT:    str d0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <4 x i32>, ptr %in
+  %hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x i32> %hi, ptr %out
+  %lo = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v4i32_half_unaligned(ptr %in, ptr %out) {
+; CHECK-LABEL: extract_v4i32_half_unaligned:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <4 x i32>, ptr %in
+  %d = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+  store <2 x i32> %d, ptr %out
+  ret void
+}
+
+;
+; Use SVE for 256-bit vectors
+;
+
+define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v4i64_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <4 x i64>, ptr %in
+  %hi = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x i64> %hi, ptr %out
+  %lo = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  store <2 x i64> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v4double_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <4 x double>, ptr %in
+  %hi = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x double> %hi, ptr %out
+  %lo = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v8i32_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x i32>, ptr %in
+  %hi = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i32> %hi, ptr %out
+  %lo = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i32> %lo, ptr %out2
+  ret void
+}
+
+; Note that both the vector.extract intrinsic and the shufflevector from
+; the previous example get detected as a extract_subvector ISD node in
+; SelectionDAG. We'll test both cases for v8i32 for the sake of completeness,
+; but other types will just be tested using shufflevector.
+define void @extract_v8i32_halves_intrinsic(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v8i32_halves_intrinsic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x i32>, ptr %in
+  %hi = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %b, i64 4)
+  store <4 x i32> %hi, ptr %out
+  %lo = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %b, i64 0)
+  store <4 x i32> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v8float_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x float>, ptr %in
+  %hi = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x float> %hi, ptr %out
+  %lo = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x float> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v8i32_half_unaligned(<8 x i32> %unused, ptr %in, ptr %out) vscale_range(2,2) {
+; CHECK-LABEL: extract_v8i32_half_unaligned:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x i32>, ptr %in
+  %d = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  store <4 x i32> %d, ptr %out
+  ret void
+}
+
+define void @extract_v8i32_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr %out4) vscale_range(2,2) {
+; CHECK-LABEL: extract_v8i32_quarters:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #24
+; CHECK-NEXT:    str d1, [x1]
+; CHECK-NEXT:    str d2, [x2]
+; CHECK-NEXT:    str d0, [x3]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    str d0, [x4]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x i32>, ptr %in
+  %hilo = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+  store <2 x i32> %hilo, ptr %out
+  %hihi = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> <i32 6, i32 7>
+  store <2 x i32> %hihi, ptr %out2
+  %lolo = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %lolo, ptr %out3
+  %lohi = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x i32> %lohi, ptr %out4
+  ret void
+}
+
+define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v16i16_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <16 x i16>, ptr %in
+  %hi = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x i16> %hi, ptr %out
+  %lo = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v16bfloat_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v16bfloat_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    str q1, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <16 x bfloat>, ptr %in
+  %hi = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x bfloat> %hi, ptr %out
+  %lo = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x bfloat> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v16half_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <16 x half>, ptr %in
+  %hi = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x half> %hi, ptr %out
+  %lo = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x half> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) {
+; CHECK-LABEL: extract_v32i8_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT:    str q1, [x1]
+; CHECK-NEXT:    str q0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <32 x i8>, ptr %in
+  %hi = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <16 x i8> %hi, ptr %out
+  %lo = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i8> %lo, ptr %out2
+  ret void
+}
+
+;
+; Use SVE for 512-bit vectors
+;
+
+define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) {
+; CHECK-LABEL: extract_v8i64_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <8 x i64>, ptr %in
+  %hi = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i64> %hi, ptr %out
+  %lo = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) {
+; CHECK-LABEL: extract_v16i32_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
+; CHECK-NEXT:    st1w { z1.s }, p0, [x1]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <16 x i32>, ptr %in
+  %hi = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <8 x i32> %hi, ptr %out
+  %lo = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %lo, ptr %out2
+  ret void
+}
+
+define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) {
+; CHECK-LABEL: extract_v32i16_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
+; CHECK-NEXT:    st1h { z1.h }, p0, [x1]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <32 x i16>, ptr %in
+  %hi = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <16 x i16> %hi, ptr %out
+  %lo = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %lo, ptr %out2
+  ret void
+}
+
+
+define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) {
+; CHECK-LABEL: extract_v64i8_halves:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #32
+; CHECK-NEXT:    st1b { z1.b }, p0, [x1]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %b = load <64 x i8>, ptr %in
+  %hi = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  store <32 x i8> %hi, ptr %out
+  %lo = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i8> %lo, ptr %out2
+  ret void
+}
+
+declare <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32>, i64)
+declare <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32>, i64)
+declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)



More information about the llvm-commits mailing list