[llvm] [LLVM][DAGCombiner] Extend coverage for insert_subv(undef, extract_subv(A, 0), 0) (PR #95242)

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 12 06:01:38 PDT 2024


https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/95242

There is an existing combine to remove the need for extract_subv that requires matching vector types (all fixed or all scalable).

The combine doesn't need this restriction and so I've changed it to use ValueType's "knownBits??" interface that supports mixed vector types. In doing so we also need extra guards to prevent invalid operations (e.g. extracting a scalable vector from a fixed length vector).

>From fa1847077dfbc0b2a1dd8dae83fc8da26f86e02b Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Thu, 25 Apr 2024 16:39:05 +0100
Subject: [PATCH] [LLVM][DAGCombiner] Extend coverage for insert_subv(undef,
 extract_subv(A, 0), 0)

There is an existing combine to remove the need for extract_subv
that requires matching vector types (all fixed or all scalable).

The combine doesn't need this restriction and so I've changed it to
use ValueType's "knownBits??" interface that supports mixed vector
types. In doing so we also need extra guards to prevent invalid
operations (e.g. extracting a scalable vector from a fixed length
vector).
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   9 +-
 ...e-streaming-mode-fixed-length-fp-to-int.ll | 254 ++++++-------
 ...sve-streaming-mode-fixed-length-int-div.ll | 310 ++++++++--------
 ...streaming-mode-fixed-length-int-extends.ll |  32 +-
 ...sve-streaming-mode-fixed-length-int-rem.ll | 342 +++++++++---------
 ...e-streaming-mode-fixed-length-int-to-fp.ll | 134 ++++---
 ...treaming-mode-fixed-length-masked-store.ll |  19 +-
 .../AArch64/vector-insert-dag-combines.ll     | 310 ++++++++++++++++
 8 files changed, 846 insertions(+), 564 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4fcbe08e4b2b9..8ea09a5744322 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26417,12 +26417,13 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
       return N1.getOperand(0);
     // TODO: To remove the zero check, need to adjust the offset to
     // a multiple of the new src type.
-    if (isNullConstant(N2) &&
-        VT.isScalableVector() == SrcVT.isScalableVector()) {
-      if (VT.getVectorMinNumElements() >= SrcVT.getVectorMinNumElements())
+    if (isNullConstant(N2)) {
+      if (VT.knownBitsGE(SrcVT) &&
+          !(VT.isFixedLengthVector() && SrcVT.isScalableVector()))
         return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
                            VT, N0, N1.getOperand(0), N2);
-      else
+      else if (VT.knownBitsLE(SrcVT) &&
+               !(VT.isScalableVector() && SrcVT.isFixedLengthVector()))
         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
                            VT, N1.getOperand(0), N2);
     }
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index 0d92a6fa0fa28..a206fbc510295 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -519,32 +519,32 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z3.h, z0.h[2]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    fcvtzu x10, h2
-; CHECK-NEXT:    fcvtzu x11, h3
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    fcvtzu x12, h0
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    mov z2.h, z1.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
 ; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    fcvtzu x10, h3
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    fcvtzu x8, h2
+; CHECK-NEXT:    fcvtzu x9, h3
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
 ; CHECK-NEXT:    fcvtzu x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp, #32]
-; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    stp x10, x9, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q2, q3, [x1]
-; CHECK-NEXT:    stp q1, q0, [x1, #32]
+; CHECK-NEXT:    ldp q2, q3, [sp]
+; CHECK-NEXT:    stp x12, x8, [sp, #32]
+; CHECK-NEXT:    stp x10, x9, [sp, #48]
+; CHECK-NEXT:    ldp q1, q0, [sp, #32]
+; CHECK-NEXT:    stp q2, q3, [x1, #32]
+; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
 ;
@@ -598,55 +598,56 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
 ; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z4.h, z1.h[2]
-; CHECK-NEXT:    fcvtzu x8, h1
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x10, h0
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    fcvtzu x11, h3
-; CHECK-NEXT:    fcvtzu x12, h4
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzu x13, h1
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z3.h, z0.h[1]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    mov z4.h, z2.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h2
+; CHECK-NEXT:    mov z5.h, z2.h[3]
+; CHECK-NEXT:    mov z2.h, z2.h[2]
+; CHECK-NEXT:    fcvtzu x12, h3
 ; CHECK-NEXT:    fcvtzu x9, h4
-; CHECK-NEXT:    stp x12, x11, [sp, #48]
+; CHECK-NEXT:    mov z4.h, z3.h[1]
+; CHECK-NEXT:    fcvtzu x10, h5
+; CHECK-NEXT:    mov z5.h, z3.h[3]
+; CHECK-NEXT:    fcvtzu x11, h2
+; CHECK-NEXT:    mov z2.h, z3.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    fcvtzu x8, h4
+; CHECK-NEXT:    fcvtzu x9, h5
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fcvtzu x10, h2
+; CHECK-NEXT:    mov z3.h, z1.h[1]
+; CHECK-NEXT:    mov z4.h, z1.h[3]
 ; CHECK-NEXT:    fcvtzu x11, h1
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    stp x12, x8, [sp, #64]
 ; CHECK-NEXT:    fcvtzu x12, h3
-; CHECK-NEXT:    stp x13, x8, [sp]
-; CHECK-NEXT:    fcvtzu x8, h5
-; CHECK-NEXT:    stp x11, x9, [sp, #16]
-; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x11, h0
+; CHECK-NEXT:    fcvtzu x8, h4
+; CHECK-NEXT:    stp x10, x9, [sp, #80]
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzu x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x10, x12, [sp, #96]
-; CHECK-NEXT:    ldp q3, q4, [sp]
-; CHECK-NEXT:    fcvtzu x10, h1
-; CHECK-NEXT:    fcvtzu x12, h2
-; CHECK-NEXT:    stp x9, x8, [sp, #112]
+; CHECK-NEXT:    stp x11, x12, [sp, #32]
+; CHECK-NEXT:    fcvtzu x11, h2
+; CHECK-NEXT:    fcvtzu x12, h3
+; CHECK-NEXT:    stp x9, x8, [sp, #48]
 ; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp, #32]
-; CHECK-NEXT:    ldp q6, q7, [sp, #96]
-; CHECK-NEXT:    stp x11, x10, [sp, #64]
-; CHECK-NEXT:    stp x8, x12, [sp, #80]
-; CHECK-NEXT:    ldp q5, q2, [sp, #64]
-; CHECK-NEXT:    stp q0, q1, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1, #64]
-; CHECK-NEXT:    stp q5, q2, [x1, #96]
+; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    ldp q3, q4, [sp, #64]
+; CHECK-NEXT:    stp x10, x11, [sp, #96]
+; CHECK-NEXT:    ldp q6, q7, [sp, #32]
+; CHECK-NEXT:    stp x8, x12, [sp, #112]
+; CHECK-NEXT:    ldp q5, q2, [sp, #96]
+; CHECK-NEXT:    stp q0, q1, [x1, #32]
+; CHECK-NEXT:    stp q6, q7, [x1]
+; CHECK-NEXT:    stp q3, q4, [x1, #96]
+; CHECK-NEXT:    stp q5, q2, [x1, #64]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
 ;
@@ -2262,32 +2263,32 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    mov z3.h, z0.h[2]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    fcvtzs x10, h2
-; CHECK-NEXT:    fcvtzs x11, h3
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    fcvtzs x12, h0
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    mov z2.h, z1.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
 ; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    stp x11, x10, [sp, #48]
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    fcvtzs x10, h3
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #-64]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    fcvtzs x8, h2
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
 ; CHECK-NEXT:    fcvtzs x10, h0
-; CHECK-NEXT:    ldp q2, q3, [sp, #32]
-; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    stp x10, x9, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q2, q3, [x1]
-; CHECK-NEXT:    stp q1, q0, [x1, #32]
+; CHECK-NEXT:    ldp q2, q3, [sp]
+; CHECK-NEXT:    stp x12, x8, [sp, #32]
+; CHECK-NEXT:    stp x10, x9, [sp, #48]
+; CHECK-NEXT:    ldp q1, q0, [sp, #32]
+; CHECK-NEXT:    stp q2, q3, [x1, #32]
+; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
 ;
@@ -2341,55 +2342,56 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
 ; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z4.h, z1.h[2]
-; CHECK-NEXT:    fcvtzs x8, h1
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    mov z5.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x10, h0
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    fcvtzs x11, h3
-; CHECK-NEXT:    fcvtzs x12, h4
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z4.h, z1.h[3]
-; CHECK-NEXT:    fcvtzs x13, h1
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    mov z3.h, z0.h[1]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    mov z4.h, z2.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h2
+; CHECK-NEXT:    mov z5.h, z2.h[3]
+; CHECK-NEXT:    mov z2.h, z2.h[2]
+; CHECK-NEXT:    fcvtzs x12, h3
 ; CHECK-NEXT:    fcvtzs x9, h4
-; CHECK-NEXT:    stp x12, x11, [sp, #48]
+; CHECK-NEXT:    mov z4.h, z3.h[1]
+; CHECK-NEXT:    fcvtzs x10, h5
+; CHECK-NEXT:    mov z5.h, z3.h[3]
+; CHECK-NEXT:    fcvtzs x11, h2
+; CHECK-NEXT:    mov z2.h, z3.h[2]
+; CHECK-NEXT:    stp x8, x9, [sp, #-128]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 128
+; CHECK-NEXT:    fcvtzs x8, h4
+; CHECK-NEXT:    fcvtzs x9, h5
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fcvtzs x10, h2
+; CHECK-NEXT:    mov z3.h, z1.h[1]
+; CHECK-NEXT:    mov z4.h, z1.h[3]
 ; CHECK-NEXT:    fcvtzs x11, h1
-; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    stp x12, x8, [sp, #64]
 ; CHECK-NEXT:    fcvtzs x12, h3
-; CHECK-NEXT:    stp x13, x8, [sp]
-; CHECK-NEXT:    fcvtzs x8, h5
-; CHECK-NEXT:    stp x11, x9, [sp, #16]
-; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    mov z2.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    fcvtzs x8, h4
+; CHECK-NEXT:    stp x10, x9, [sp, #80]
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzs x10, h0
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x10, x12, [sp, #96]
-; CHECK-NEXT:    ldp q3, q4, [sp]
-; CHECK-NEXT:    fcvtzs x10, h1
-; CHECK-NEXT:    fcvtzs x12, h2
-; CHECK-NEXT:    stp x9, x8, [sp, #112]
+; CHECK-NEXT:    stp x11, x12, [sp, #32]
+; CHECK-NEXT:    fcvtzs x11, h2
+; CHECK-NEXT:    fcvtzs x12, h3
+; CHECK-NEXT:    stp x9, x8, [sp, #48]
 ; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    ldp q0, q1, [sp, #32]
-; CHECK-NEXT:    ldp q6, q7, [sp, #96]
-; CHECK-NEXT:    stp x11, x10, [sp, #64]
-; CHECK-NEXT:    stp x8, x12, [sp, #80]
-; CHECK-NEXT:    ldp q5, q2, [sp, #64]
-; CHECK-NEXT:    stp q0, q1, [x1]
-; CHECK-NEXT:    stp q3, q4, [x1, #32]
-; CHECK-NEXT:    stp q6, q7, [x1, #64]
-; CHECK-NEXT:    stp q5, q2, [x1, #96]
+; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    ldp q3, q4, [sp, #64]
+; CHECK-NEXT:    stp x10, x11, [sp, #96]
+; CHECK-NEXT:    ldp q6, q7, [sp, #32]
+; CHECK-NEXT:    stp x8, x12, [sp, #112]
+; CHECK-NEXT:    ldp q5, q2, [sp, #96]
+; CHECK-NEXT:    stp q0, q1, [x1, #32]
+; CHECK-NEXT:    stp q6, q7, [x1]
+; CHECK-NEXT:    stp q3, q4, [x1, #96]
+; CHECK-NEXT:    stp q5, q2, [x1, #64]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index e09b1613a54af..233939f7285fa 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -127,19 +127,17 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
@@ -157,11 +155,11 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
-; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -244,31 +242,28 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q6, q2, [x0]
+; CHECK-NEXT:    ldp q6, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q7, q3, [x1]
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z16.d, z6.d
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    ext z1.b, z1.b, z2.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z6.h, z6.b
-; CHECK-NEXT:    ext z0.b, z0.b, z3.b, #8
+; CHECK-NEXT:    ldr q2, [x0, #16]
+; CHECK-NEXT:    sunpklo z1.h, z3.b
+; CHECK-NEXT:    sunpklo z4.h, z2.b
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z7.h, z6.b
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z16.h, z16.b
-; CHECK-NEXT:    sunpklo z4.h, z0.b
-; CHECK-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z1.h
+; CHECK-NEXT:    sunpklo z5.s, z4.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z18.s, z16.h
-; CHECK-NEXT:    sunpklo z0.s, z4.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    sunpklo z17.s, z7.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    sunpklo z6.h, z6.b
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z5.s
-; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    sunpklo z4.h, z2.b
 ; CHECK-NEXT:    sunpklo z2.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
@@ -278,44 +273,44 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    mov z5.d, z7.d
-; CHECK-NEXT:    ext z5.b, z5.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z7.h, z7.b
+; CHECK-NEXT:    ldr q5, [x0]
+; CHECK-NEXT:    sunpklo z16.h, z5.b
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    sunpklo z5.h, z5.b
-; CHECK-NEXT:    sunpklo z17.s, z5.h
+; CHECK-NEXT:    sunpklo z18.s, z16.h
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    sunpklo z18.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    sunpklo z18.s, z6.h
+; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT:    sunpklo z16.s, z6.h
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT:    sunpklo z16.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    uzp1 z4.h, z17.h, z17.h
 ; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uzp1 z6.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z4.h, z17.h, z17.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z6.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z6.h
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z0.b
+; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
-; CHECK-NEXT:    stp q3, q2, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z3.b
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v32i8:
@@ -542,20 +537,18 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -609,35 +602,31 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    sunpklo z5.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z5.h, z5.h
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
-; CHECK-NEXT:    stp q3, q0, [x0]
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z3.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sdiv_v16i16:
@@ -1054,19 +1043,17 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
@@ -1084,11 +1071,11 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
-; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1171,31 +1158,28 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: udiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q6, q2, [x0]
+; CHECK-NEXT:    ldp q6, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q7, q3, [x1]
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z16.d, z6.d
-; CHECK-NEXT:    mov z0.d, z3.d
-; CHECK-NEXT:    ext z1.b, z1.b, z2.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z6.h, z6.b
-; CHECK-NEXT:    ext z0.b, z0.b, z3.b, #8
+; CHECK-NEXT:    ldr q2, [x0, #16]
+; CHECK-NEXT:    uunpklo z1.h, z3.b
+; CHECK-NEXT:    uunpklo z4.h, z2.b
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z7.h, z6.b
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z16.h, z16.b
-; CHECK-NEXT:    uunpklo z4.h, z0.b
-; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z4.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z18.s, z16.h
-; CHECK-NEXT:    uunpklo z0.s, z4.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    uunpklo z17.s, z7.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    uunpklo z6.h, z6.b
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z5.s
-; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    uunpklo z4.h, z2.b
 ; CHECK-NEXT:    uunpklo z2.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
@@ -1205,44 +1189,44 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    mov z5.d, z7.d
-; CHECK-NEXT:    ext z5.b, z5.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z7.h, z7.b
+; CHECK-NEXT:    ldr q5, [x0]
+; CHECK-NEXT:    uunpklo z16.h, z5.b
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uunpklo z5.h, z5.b
-; CHECK-NEXT:    uunpklo z17.s, z5.h
+; CHECK-NEXT:    uunpklo z18.s, z16.h
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    uunpklo z18.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    uunpklo z18.s, z6.h
+; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT:    uunpklo z16.s, z6.h
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT:    uunpklo z16.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    uzp1 z4.h, z17.h, z17.h
 ; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uzp1 z6.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z4.h, z17.h, z17.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z6.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z6.h
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z0.b
+; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
-; CHECK-NEXT:    stp q3, q2, [x0]
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    splice z1.b, p0, z1.b, z3.b
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v32i8:
@@ -1469,20 +1453,18 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1536,35 +1518,31 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    uunpklo z5.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z5.h, z5.h
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
-; CHECK-NEXT:    stp q3, q0, [x0]
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z3.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: udiv_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index 2c2b79121ef82..d3ac1445e1086 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -1016,22 +1016,20 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    sunpklo z4.d, z2.s
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z0.d
+; CHECK-NEXT:    sunpklo z7.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    stp q4, q2, [x0]
-; CHECK-NEXT:    sunpklo z4.d, z7.s
-; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q6, q1, [x0, #32]
 ; CHECK-NEXT:    stp q5, q3, [x0, #64]
-; CHECK-NEXT:    sunpklo z2.d, z6.s
-; CHECK-NEXT:    stp q1, q4, [x0, #32]
-; CHECK-NEXT:    stp q0, q2, [x0, #96]
+; CHECK-NEXT:    stp q7, q0, [x0, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64:
@@ -2983,22 +2981,20 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z7.d, z1.d
-; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z6.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    mov z6.d, z0.d
+; CHECK-NEXT:    uunpklo z7.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    stp q4, q2, [x0]
-; CHECK-NEXT:    uunpklo z4.d, z7.s
-; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q6, q1, [x0, #32]
 ; CHECK-NEXT:    stp q5, q3, [x0, #64]
-; CHECK-NEXT:    uunpklo z2.d, z6.s
-; CHECK-NEXT:    stp q1, q4, [x0, #32]
-; CHECK-NEXT:    stp q0, q2, [x0, #96]
+; CHECK-NEXT:    stp q7, q0, [x0, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index cb1fb20ec9d8d..dd6d2dcacd616 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -141,13 +141,9 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
@@ -155,13 +151,17 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpklo z5.h, z0.b
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z5.h, z5.b
 ; CHECK-NEXT:    sunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z5.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.h, z1.b
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
@@ -176,9 +176,9 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z5.h, p0, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -279,57 +279,58 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    sunpklo z7.h, z1.b
-; CHECK-NEXT:    sunpklo z16.h, z0.b
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z6.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z17.s, z16.h
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
-; CHECK-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sunpklo z16.s, z16.h
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z17.s
-; CHECK-NEXT:    sunpklo z2.s, z4.h
-; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z3.h, z1.b
+; CHECK-NEXT:    sunpklo z4.h, z0.b
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sunpklo z5.s, z4.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    movprfx z5, z3
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
+; CHECK-NEXT:    movprfx z5, z4
+; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z7.h, z3.b
+; CHECK-NEXT:    sunpklo z16.h, z4.b
+; CHECK-NEXT:    sunpklo z3.s, z7.h
+; CHECK-NEXT:    sunpklo z4.s, z16.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z7.s, z7.h
+; CHECK-NEXT:    movprfx z6, z4
+; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldr q4, [x1]
-; CHECK-NEXT:    mov z18.d, z3.d
-; CHECK-NEXT:    mov z17.d, z4.d
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    ext z18.b, z18.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z18.h, z18.b
-; CHECK-NEXT:    sunpklo z17.h, z17.b
-; CHECK-NEXT:    sunpklo z20.s, z18.h
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    sunpklo z17.h, z4.b
+; CHECK-NEXT:    sunpklo z18.h, z3.b
+; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    sunpklo z19.s, z17.h
+; CHECK-NEXT:    sunpklo z20.s, z18.h
 ; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
-; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT:    sunpklo z18.s, z18.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    sunpklo z17.s, z17.h
+; CHECK-NEXT:    sunpklo z18.s, z18.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT:    sdivr z19.s, p0/m, z19.s, z20.s
-; CHECK-NEXT:    sunpklo z20.h, z3.b
+; CHECK-NEXT:    mov z20.d, z3.d
+; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    sunpklo z20.h, z20.b
 ; CHECK-NEXT:    sunpklo z22.s, z20.h
 ; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
-; CHECK-NEXT:    sunpklo z20.s, z20.h
 ; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    sunpklo z18.h, z4.b
+; CHECK-NEXT:    mov z18.d, z4.d
+; CHECK-NEXT:    sunpklo z20.s, z20.h
+; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
 ; CHECK-NEXT:    uzp1 z16.h, z19.h, z19.h
+; CHECK-NEXT:    sunpklo z18.h, z18.b
 ; CHECK-NEXT:    sunpklo z21.s, z18.h
 ; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    sunpklo z18.s, z18.h
@@ -347,14 +348,13 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    splice z6.b, p0, z6.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z6.b
 ; CHECK-NEXT:    uzp1 z7.b, z19.b, z19.b
-; CHECK-NEXT:    splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT:    splice z5.b, p0, z5.b, z7.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z6.b, z1.b
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    mls z3.b, p0/m, z5.b, z4.b
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v32i8:
@@ -588,23 +588,23 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpklo z4.s, z0.h
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -665,39 +665,38 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    sunpklo z16.s, z0.h
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    sunpklo z5.s, z4.h
+; CHECK-NEXT:    mov z16.d, z0.d
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    sunpklo z7.s, z3.h
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z6.s, z6.h
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    sunpklo z7.s, z7.h
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z6.h, p0, z6.h, z5.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z6.h
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z2.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z7.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    mls z2.h, p0/m, z6.h, z4.h
-; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    mls z3.h, p0/m, z5.h, z4.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: srem_v16i16:
@@ -1192,13 +1191,9 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
@@ -1206,13 +1201,17 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpklo z5.h, z0.b
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z5.h, z5.b
 ; CHECK-NEXT:    uunpklo z7.s, z5.h
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z5.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.h, z1.b
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
@@ -1227,9 +1226,9 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    splice z5.h, p0, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -1330,57 +1329,58 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    uunpklo z7.h, z1.b
-; CHECK-NEXT:    uunpklo z16.h, z0.b
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z6.s, z7.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z17.s, z16.h
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
-; CHECK-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    uunpklo z16.s, z16.h
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z17.s
-; CHECK-NEXT:    uunpklo z2.s, z4.h
-; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z3.h, z1.b
+; CHECK-NEXT:    uunpklo z4.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z3.h
+; CHECK-NEXT:    uunpklo z5.s, z4.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    movprfx z5, z3
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
+; CHECK-NEXT:    movprfx z5, z4
+; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z7.h, z3.b
+; CHECK-NEXT:    uunpklo z16.h, z4.b
+; CHECK-NEXT:    uunpklo z3.s, z7.h
+; CHECK-NEXT:    uunpklo z4.s, z16.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z7.s, z7.h
+; CHECK-NEXT:    movprfx z6, z4
+; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldr q4, [x1]
-; CHECK-NEXT:    mov z18.d, z3.d
-; CHECK-NEXT:    mov z17.d, z4.d
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    ext z18.b, z18.b, z3.b, #8
-; CHECK-NEXT:    ext z17.b, z17.b, z4.b, #8
-; CHECK-NEXT:    uunpklo z18.h, z18.b
-; CHECK-NEXT:    uunpklo z17.h, z17.b
-; CHECK-NEXT:    uunpklo z20.s, z18.h
-; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uunpklo z17.h, z4.b
+; CHECK-NEXT:    uunpklo z18.h, z3.b
+; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    uunpklo z19.s, z17.h
+; CHECK-NEXT:    uunpklo z20.s, z18.h
 ; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
-; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT:    uunpklo z18.s, z18.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    uunpklo z17.s, z17.h
+; CHECK-NEXT:    uunpklo z18.s, z18.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT:    udivr z19.s, p0/m, z19.s, z20.s
-; CHECK-NEXT:    uunpklo z20.h, z3.b
+; CHECK-NEXT:    mov z20.d, z3.d
+; CHECK-NEXT:    ext z20.b, z20.b, z3.b, #8
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    uunpklo z20.h, z20.b
 ; CHECK-NEXT:    uunpklo z22.s, z20.h
 ; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
-; CHECK-NEXT:    uunpklo z20.s, z20.h
 ; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    uunpklo z18.h, z4.b
+; CHECK-NEXT:    mov z18.d, z4.d
+; CHECK-NEXT:    uunpklo z20.s, z20.h
+; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
 ; CHECK-NEXT:    uzp1 z16.h, z19.h, z19.h
+; CHECK-NEXT:    uunpklo z18.h, z18.b
 ; CHECK-NEXT:    uunpklo z21.s, z18.h
 ; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
 ; CHECK-NEXT:    uunpklo z18.s, z18.h
@@ -1398,14 +1398,13 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    splice z6.b, p0, z6.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z6.b
 ; CHECK-NEXT:    uzp1 z7.b, z19.b, z19.b
-; CHECK-NEXT:    splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT:    splice z5.b, p0, z5.b, z7.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z6.b, z1.b
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    mls z2.b, p0/m, z7.b, z4.b
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    mls z3.b, p0/m, z5.b, z4.b
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v32i8:
@@ -1639,23 +1638,23 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    mov z4.d, z0.d
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
 ;
@@ -1716,39 +1715,38 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    uunpklo z16.s, z0.h
-; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    uunpklo z5.s, z4.h
+; CHECK-NEXT:    mov z16.d, z0.d
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    uunpklo z7.s, z3.h
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z6.s, z6.h
+; CHECK-NEXT:    ext z16.b, z16.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uunpklo z7.s, z7.h
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z6.h, p0, z6.h, z5.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z6.h
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z2.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z7.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    mls z2.h, p0/m, z6.h, z4.h
-; CHECK-NEXT:    mls z0.h, p0/m, z7.h, z1.h
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    mls z3.h, p0/m, z5.h, z4.h
+; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: urem_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index fd2d9a8fb80d1..afd3bb7161c15 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -492,19 +492,18 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    stp q1, q3, [x1]
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    stp q2, q1, [x1]
+; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64:
@@ -568,42 +567,42 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    mov z4.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    mov z5.d, z3.d
+; CHECK-NEXT:    uunpklo z4.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
+; CHECK-NEXT:    mov z7.d, z1.d
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z4.d, z4.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    uunpklo z7.d, z7.s
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    uunpklo z5.d, z5.s
-; CHECK-NEXT:    uunpklo z6.d, z6.s
-; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z7.d, z7.s
 ; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    stp q2, q4, [x1, #64]
-; CHECK-NEXT:    movprfx z2, z6
-; CHECK-NEXT:    ucvtf z2.d, p0/m, z6.d
-; CHECK-NEXT:    stp q0, q5, [x1, #96]
-; CHECK-NEXT:    movprfx z0, z7
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
-; CHECK-NEXT:    stp q3, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z7.d
+; CHECK-NEXT:    movprfx z0, z6
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64:
@@ -1954,19 +1953,18 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    sunpklo z1.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    mov z3.d, z1.d
-; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpklo z2.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    stp q1, q3, [x1]
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    stp q2, q1, [x1]
+; CHECK-NEXT:    stp q3, q0, [x1, #32]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64:
@@ -2026,42 +2024,42 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    sunpklo z2.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    mov z4.d, z2.d
-; CHECK-NEXT:    mov z7.d, z3.d
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    mov z5.d, z3.d
+; CHECK-NEXT:    sunpklo z4.d, z0.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    ext z5.b, z5.b, z3.b, #8
+; CHECK-NEXT:    mov z7.d, z1.d
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    sunpklo z5.d, z5.s
-; CHECK-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    ext z6.b, z6.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z7.d, z7.s
 ; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    stp q2, q4, [x1, #64]
-; CHECK-NEXT:    movprfx z2, z6
-; CHECK-NEXT:    scvtf z2.d, p0/m, z6.d
-; CHECK-NEXT:    stp q0, q5, [x1, #96]
-; CHECK-NEXT:    movprfx z0, z7
-; CHECK-NEXT:    scvtf z0.d, p0/m, z7.d
-; CHECK-NEXT:    stp q1, q2, [x1, #32]
-; CHECK-NEXT:    stp q3, q0, [x1]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    stp q4, q0, [x1, #64]
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    stp q3, q5, [x1]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    scvtf z3.d, p0/m, z7.d
+; CHECK-NEXT:    movprfx z0, z6
+; CHECK-NEXT:    scvtf z0.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q3, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #96]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index a79ce9db9abfd..13b83d2ae3f07 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -963,21 +963,20 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    uunpklo z1.h, z0.b
 ; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    lsl z1.h, z1.h, #15
-; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z1.h, z1.h, #15
-; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    mov z1.h, #0 // =0x0
-; CHECK-NEXT:    st1h { z1.h }, p1, [x0, x8, lsl #1]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    lsl z0.h, z0.h, #15
+; CHECK-NEXT:    asr z0.h, z0.h, #15
+; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    st1h { z0.h }, p1, [x0, x8, lsl #1]
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: masked_store_v16f16:
diff --git a/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll b/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll
new file mode 100644
index 0000000000000..c63808ed48385
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll
@@ -0,0 +1,310 @@
+; RUN: llc -debug-only=isel -o /dev/null < %s 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; NOTE: Due to their nature the expected inserts and extracts often emit no
+; instructions and so these tests verify the output of DAGCombiner directly.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: Initial selection DAG: %bb.0 'insert_small_fixed_into_big_fixed:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: v16i8 = insert_subvector undef:v16i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_small_fixed_into_big_fixed:'
+; CHECK: SelectionDAG has 9 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0
+; CHECK:     t10: v16i8 = insert_subvector undef:v16i8, t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1
+
+define <16 x i8> @insert_small_fixed_into_big_fixed(<8 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<8 x i8> %a, i64 0)
+  %insert = call <16 x i8> @llvm.vector.insert(<16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'insert_small_fixed_into_big_scalable:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: nxv16i8 = insert_subvector undef:nxv16i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_small_fixed_into_big_scalable:'
+; CHECK: SelectionDAG has 9 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0
+; CHECK:     t10: nxv16i8 = insert_subvector undef:nxv16i8, t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1
+
+define <vscale x 16 x i8> @insert_small_fixed_into_big_scalable(<8 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<8 x i8> %a, i64 0)
+  %insert = call <vscale x 16 x i8> @llvm.vector.insert(<vscale x 16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <vscale x 16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'insert_small_scalable_into_big_fixed:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:         t3: nxv8i8 = truncate t2
+; CHECK:       t5: v4i8 = extract_subvector t3, Constant:i64<0>
+; CHECK:     t7: v16i8 = insert_subvector undef:v16i8, t5, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:v16i8 $q0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:v16i8 $q0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_small_scalable_into_big_fixed:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:         t3: nxv8i8 = truncate t2
+; CHECK:       t5: v4i8 = extract_subvector t3, Constant:i64<0>
+; CHECK:     t7: v16i8 = insert_subvector undef:v16i8, t5, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:v16i8 $q0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:v16i8 $q0, t9:1
+
+; Resulting insert would not be legal, so there's no transformation.
+define <16 x i8> @insert_small_scalable_into_big_fixed(<vscale x 8 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<vscale x 8 x i8> %a, i64 0)
+  %insert = call <16 x i8> @llvm.vector.insert(<16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'insert_small_scalable_into_big_scalable_1:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:         t3: nxv8i8 = truncate t2
+; CHECK:       t5: v4i8 = extract_subvector t3, Constant:i64<0>
+; CHECK:     t7: nxv16i8 = insert_subvector undef:nxv16i8, t5, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_small_scalable_into_big_scalable_1:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:       t3: nxv8i8 = truncate t2
+; CHECK:     t11: nxv16i8 = insert_subvector undef:nxv16i8, t3, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t11
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1
+
+define <vscale x 16 x i8> @insert_small_scalable_into_big_scalable_1(<vscale x 8 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<vscale x 8 x i8> %a, i64 0)
+  %insert = call <vscale x 16 x i8> @llvm.vector.insert(<vscale x 16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <vscale x 16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'insert_small_scalable_into_big_scalable_2:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:         t3: nxv8i8 = truncate t2
+; CHECK:       t5: nxv4i8 = extract_subvector t3, Constant:i64<0>
+; CHECK:     t7: nxv16i8 = insert_subvector undef:nxv16i8, t5, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_small_scalable_into_big_scalable_2:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0
+; CHECK:       t3: nxv8i8 = truncate t2
+; CHECK:     t11: nxv16i8 = insert_subvector undef:nxv16i8, t3, Constant:i64<0>
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t11
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1
+
+define <vscale x 16 x i8> @insert_small_scalable_into_big_scalable_2(<vscale x 8 x i8> %a) #0 {
+  %extract = call <vscale x 4 x i8> @llvm.vector.extract(<vscale x 8 x i8> %a, i64 0)
+  %insert = call <vscale x 16 x i8> @llvm.vector.insert(<vscale x 16 x i8> undef, <vscale x 4 x i8> %extract, i64 0)
+  ret <vscale x 16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_small_fixed_from_big_fixed:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: v8i8 = insert_subvector undef:v8i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_small_fixed_from_big_fixed:'
+; CHECK: SelectionDAG has 8 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:     t10: v8i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1
+
+define <8 x i8> @extract_small_fixed_from_big_fixed(<16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<16 x i8> %a, i64 0)
+  %insert = call <8 x i8> @llvm.vector.insert(<8 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <8 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_small_scalable_from_big_fixed:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:         t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:       t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t6
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_small_scalable_from_big_fixed:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:         t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:       t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t6
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+; Resulting insert would not be legal, so there's no transformation.
+define <vscale x 8 x i8> @extract_small_scalable_from_big_fixed(<16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<16 x i8> %a, i64 0)
+  %insert = call <vscale x 8 x i8> @llvm.vector.insert(<vscale x 8 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <vscale x 8 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_small_fixed_from_big_scalable:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: v8i8 = insert_subvector undef:v8i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_small_fixed_from_big_scalable:'
+; CHECK: SelectionDAG has 8 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:     t10: v8i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1
+
+define <8 x i8> @extract_small_fixed_from_big_scalable(<vscale x 16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<vscale x 16 x i8> %a, i64 0)
+  %insert = call <8 x i8> @llvm.vector.insert(<8 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <8 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_small_scalable_from_big_scalable_1:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:         t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:       t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t6
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_small_scalable_from_big_scalable_1:'
+; CHECK: SelectionDAG has 9 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:       t11: nxv8i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t11
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+define <vscale x 8 x i8> @extract_small_scalable_from_big_scalable_1(<vscale x 16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<vscale x 16 x i8> %a, i64 0)
+  %insert = call <vscale x 8 x i8> @llvm.vector.insert(<vscale x 8 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <vscale x 8 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_small_scalable_from_big_scalable_2:'
+; CHECK: SelectionDAG has 11 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:           t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:         t4: nxv4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:       t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t6
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_small_scalable_from_big_scalable_2:'
+; CHECK: SelectionDAG has 9 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:       t11: nxv8i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t7: nxv8i16 = any_extend t11
+; CHECK:   t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7
+; CHECK:   t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1
+
+define <vscale x 8 x i8> @extract_small_scalable_from_big_scalable_2(<vscale x 16 x i8> %a) #0 {
+  %extract = call <vscale x 4 x i8> @llvm.vector.extract(<vscale x 16 x i8> %a, i64 0)
+  %insert = call <vscale x 8 x i8> @llvm.vector.insert(<vscale x 8 x i8> undef, <vscale x 4 x i8> %extract, i64 0)
+  ret <vscale x 8 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'extract_fixed_from_scalable:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: v16i8 = insert_subvector undef:v16i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'extract_fixed_from_scalable:'
+; CHECK: SelectionDAG has 8 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0
+; CHECK:     t10: v16i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1
+
+; A variant of insert_small_scalable_into_big_fixed whose vector types prevent
+; the expected transformation because the resulting insert would not be legal.
+; In this instance their matching minimum vector lengths allow us to perform the
+; opposite transformation and emit an extract instead.
+define <16 x i8> @extract_fixed_from_scalable(<vscale x 16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<vscale x 16 x i8> %a, i64 0)
+  %insert = call <16 x i8> @llvm.vector.insert(<16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <16 x i8> %insert
+}
+
+; CHECK: Initial selection DAG: %bb.0 'insert_fixed_into_scalable:'
+; CHECK: SelectionDAG has 10 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:         t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:       t4: v4i8 = extract_subvector t2, Constant:i64<0>
+; CHECK:     t6: nxv16i8 = insert_subvector undef:nxv16i8, t4, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t6
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1
+
+; CHECK: Optimized lowered selection DAG: %bb.0 'insert_fixed_into_scalable:'
+; CHECK: SelectionDAG has 9 nodes:
+; CHECK:   t0: ch,glue = EntryToken
+; CHECK:       t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0
+; CHECK:     t10: nxv16i8 = insert_subvector undef:nxv16i8, t2, Constant:i64<0>
+; CHECK:   t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t10
+; CHECK:   t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1
+
+; A variant of extract_small_scalable_from_big_fixed whose vector types prevent
+; the expected transformation because the resulting extract would not be legal.
+; In this instance their matching minimum vector lengths allow us to perform the
+; opposite transformation and emit an insert instead.
+define <vscale x 16 x i8> @insert_fixed_into_scalable(<16 x i8> %a) #0 {
+  %extract = call <4 x i8> @llvm.vector.extract(<16 x i8> %a, i64 0)
+  %insert = call <vscale x 16 x i8> @llvm.vector.insert(<vscale x 16 x i8> undef, <4 x i8> %extract, i64 0)
+  ret <vscale x 16 x i8> %insert
+}
+
+attributes #0 = { "target-features"="+sve" }



More information about the llvm-commits mailing list