[llvm] 8812b6e - [AArch64][SVE][Fixed length] Fix div miscompile

Peter Waller via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 12 03:33:12 PST 2022


Author: Peter Waller
Date: 2022-12-12T11:31:02Z
New Revision: 8812b6eed7b2328d6e2739071f6460bfd47ac8e2

URL: https://github.com/llvm/llvm-project/commit/8812b6eed7b2328d6e2739071f6460bfd47ac8e2
DIFF: https://github.com/llvm/llvm-project/commit/8812b6eed7b2328d6e2739071f6460bfd47ac8e2.diff

LOG: [AArch64][SVE][Fixed length] Fix div miscompile

The prior code worked before SVE DIV was enabled for 128-bit vectors.
With 128-bit vectors, when run on a 256-bit machine, it would split and
do a signed unpack, but this resulted in one full vector and one empty
vector with a half-sized predicate. The effect was that only half the
elements were treated correctly.

The fix is to bisect the vector, sign- or zero-extend each half (matching
the signedness of the operation), do the division, truncate, and then
concatenate the halves.

Fixes #59357.

Differential Revision: https://reviews.llvm.org/D139618

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f79b0f16e656..6ed58ab5662d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22624,50 +22624,39 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
     return LowerToPredicatedOp(Op, DAG, PredOpcode);
 
   // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
-  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
   EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
-  EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
-  EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
-
-  // If this is not a full vector, extend, div, and truncate it.
-  EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
-  if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
-    unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
-    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
-    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
+  EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
+  unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+  // If the wider type is legal: extend, op, and truncate.
+  EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+  if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
+    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
+    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
+    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
     return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
   }
 
-  // Convert the operands to scalable vectors.
-  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
-  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+  auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
+                               &ExtendOpcode](SDValue Op) {
+    SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
+    SDValue IdxHalf =
+        DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
+    return std::pair<SDValue, SDValue>(
+        {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
+         DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
+  };
 
-  // Extend the scalable operands.
-  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
-  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
-  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
-  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
-  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
-  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
-
-  // Convert back to fixed vectors so the DIV can be further lowered.
-  Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
-  Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
-  Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
-  Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
-  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
-                                 Op0Lo, Op1Lo);
-  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
-                                 Op0Hi, Op1Hi);
-
-  // Convert again to scalable vectors to truncate.
-  ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
-  ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
-  SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
-                                       ResultLo, ResultHi);
-
-  return convertFromScalableVector(DAG, VT, ScalableResult);
+  // If wider type is not legal: split, extend, op, trunc and concat.
+  auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
+  auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
+  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
+  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
+  SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
+  SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
 }
 
 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index ca396356abe2..cfd755e20f12 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -18,13 +18,13 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
-; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
@@ -94,29 +94,26 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT:    sunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpklo z1.h, z1.b
-; VBITS_GE_128-NEXT:    sunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    sunpkhi z3.s, z1.h
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z1.h, z2.h, z4.h
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z3.h
-; VBITS_GE_128-NEXT:    uzp1 z0.b, z0.b, z1.b
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v16i8:
@@ -126,14 +123,19 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z2.b, z2.b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -206,15 +208,20 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl64
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
+; CHECK-NEXT:    st1b { z2.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -230,26 +237,41 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    sunpkhi z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p2.h, vl64
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    sunpkhi z5.s, z1.h
+; CHECK-NEXT:    sunpklo z5.s, z1.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    splice z3.h, p2, z3.h, z2.h
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    sdiv z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
+; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT:    ptrue p1.b, vl128
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    splice z2.b, p1, z2.b, z0.b
+; CHECK-NEXT:    st1b { z2.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
   %op2 = load <256 x i8>, ptr %b
@@ -308,17 +330,14 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v8i16:
@@ -351,24 +370,25 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT:    ldp q3, q0, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpkhi z6.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_128-NEXT:    ldp q3, q2, [x0]
-; VBITS_GE_128-NEXT:    sunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    sshll2 v4.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    sshll2 v7.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
 ; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    sdiv z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z3.s
-; VBITS_GE_128-NEXT:    sdivr z1.s, p0/m, z1.s, z2.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z5.h
-; VBITS_GE_128-NEXT:    uzp1 z1.h, z1.h, z4.h
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_128-NEXT:    movprfx z2, z7
+; VBITS_GE_128-NEXT:    sdiv z2.s, p0/m, z2.s, z6.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v16i16:
@@ -377,14 +397,19 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: sdiv_v16i16:
@@ -450,14 +475,19 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %op2 = load <128 x i16>, ptr %b
@@ -724,13 +754,13 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
-; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
@@ -800,29 +830,26 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT:    uunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpklo z1.h, z1.b
-; VBITS_GE_128-NEXT:    uunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    uunpkhi z3.s, z1.h
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z1.h, z2.h, z4.h
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z3.h
-; VBITS_GE_128-NEXT:    uzp1 z0.b, z0.b, z1.b
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v16i8:
@@ -832,14 +859,19 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z2.b, z2.b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -900,14 +932,19 @@ define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
 ; CHECK-NEXT:    ld1b { z1.h }, p0/z, [x0]
-; CHECK-NEXT:    uunpkhi z2.s, z0.h
-; CHECK-NEXT:    uunpkhi z3.s, z1.h
+; CHECK-NEXT:    uunpklo z2.s, z0.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    udivr z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT:    st1b { z2.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -923,26 +960,41 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    uunpkhi z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p2.h, vl64
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    uunpkhi z5.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
+; CHECK-NEXT:    splice z3.h, p2, z3.h, z2.h
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    udiv z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
+; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT:    ptrue p1.b, vl128
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    splice z2.b, p1, z2.b, z0.b
+; CHECK-NEXT:    st1b { z2.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
   %op2 = load <256 x i8>, ptr %b
@@ -1001,17 +1053,14 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v8i16:
@@ -1044,24 +1093,25 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @udiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: udiv_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT:    ldp q3, q0, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpkhi z6.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_128-NEXT:    ldp q3, q2, [x0]
-; VBITS_GE_128-NEXT:    uunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    ushll2 v4.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    ushll2 v7.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
 ; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    udiv z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z3.s
-; VBITS_GE_128-NEXT:    udivr z1.s, p0/m, z1.s, z2.s
-; VBITS_GE_128-NEXT:    uzp1 z0.h, z0.h, z5.h
-; VBITS_GE_128-NEXT:    uzp1 z1.h, z1.h, z4.h
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_128-NEXT:    movprfx z2, z7
+; VBITS_GE_128-NEXT:    udiv z2.s, p0/m, z2.s, z6.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v16i16:
@@ -1070,14 +1120,19 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: udiv_v16i16:
@@ -1134,14 +1189,19 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %op2 = load <128 x i16>, ptr %b

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 9312711530e7..fb060f4dcae5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -18,13 +18,13 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_128-NEXT:    sshll v3.8h, v0.8b, #0
-; VBITS_GE_128-NEXT:    sunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
 ; VBITS_GE_128-NEXT:    ret
@@ -97,30 +97,28 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT:    sunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT:    sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT:    sunpkhi z6.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sshll v6.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    sshll v7.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    sunpklo z3.h, z0.b
-; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    sunpkhi z6.s, z4.h
-; VBITS_GE_128-NEXT:    sunpkhi z7.s, z3.h
-; VBITS_GE_128-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_128-NEXT:    sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z5.h
-; VBITS_GE_128-NEXT:    uzp1 z3.h, z3.h, z6.h
-; VBITS_GE_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v6.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v7.8h, #0
+; VBITS_GE_128-NEXT:    sshll v6.4s, v6.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    sshll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
+; VBITS_GE_128-NEXT:    movprfx z4, z7
+; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
 ; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v16i8:
@@ -129,15 +127,20 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
 ; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
+; VBITS_GE_256-NEXT:    sunpklo z5.s, z3.h
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpkhi z4.s, z2.h
-; VBITS_GE_256-NEXT:    sunpkhi z5.s, z3.h
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z4.h, p0, z4.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
@@ -218,14 +221,19 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
-; CHECK-NEXT:    sunpkhi z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -243,26 +251,42 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    sunpkhi z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z3.h, z0.b
-; CHECK-NEXT:    sunpklo z4.h, z1.b
-; CHECK-NEXT:    sunpklo z5.h, z0.b
-; CHECK-NEXT:    sunpkhi z6.s, z2.h
-; CHECK-NEXT:    sunpkhi z7.s, z3.h
+; CHECK-NEXT:    ptrue p2.h, vl64
+; CHECK-NEXT:    sunpklo z2.h, z1.b
+; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpklo z5.s, z2.h
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    sdivr z5.s, p1/m, z5.s, z6.s
+; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
-; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #128
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    sunpkhi z3.s, z5.h
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #128
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z3.h, z4.b
+; CHECK-NEXT:    sunpklo z4.h, z6.b
+; CHECK-NEXT:    splice z5.h, p2, z5.h, z2.h
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #128
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z7.s
-; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z6.s
+; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT:    ptrue p1.b, vl128
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z4.b, p1, z4.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
@@ -323,19 +347,16 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v4.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    sunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT:    sshll v5.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    movprfx z3, z5
 ; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v8i16:
@@ -370,26 +391,26 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sunpkhi z17.s, z2.h
-; VBITS_GE_128-NEXT:    ldp q3, q1, [x1]
-; VBITS_GE_128-NEXT:    sunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT:    sunpklo z7.s, z0.h
-; VBITS_GE_128-NEXT:    sunpkhi z16.s, z3.h
-; VBITS_GE_128-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
-; VBITS_GE_128-NEXT:    sunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT:    sunpklo z6.s, z1.h
-; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    sunpklo z5.s, z3.h
-; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT:    sunpklo z7.s, z2.h
-; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT:    uzp1 z4.h, z6.h, z4.h
-; VBITS_GE_128-NEXT:    uzp1 z5.h, z5.h, z16.h
-; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v3.8h
-; VBITS_GE_128-NEXT:    mls v0.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v7.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    sshll2 v4.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v6.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    sshll v16.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    sshll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    sshll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
+; VBITS_GE_128-NEXT:    sshll v16.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z17.s
+; VBITS_GE_128-NEXT:    uzp1 v5.8h, v7.8h, v5.8h
+; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z16.s
+; VBITS_GE_128-NEXT:    uzp1 v4.8h, v6.8h, v4.8h
+; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v0.8h
+; VBITS_GE_128-NEXT:    mls v3.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v16i16:
@@ -398,14 +419,20 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    sunpkhi z3.s, z0.h
-; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    sunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT:    mov z3.d, z1.d
+; VBITS_GE_256-NEXT:    mov z4.d, z0.d
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT:    movprfx z3, z5
-; VBITS_GE_256-NEXT:    sdiv z3.s, p1/m, z3.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -479,14 +506,20 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
-; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
+; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    sdiv z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -805,13 +838,13 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_128-NEXT:    ushll v3.8h, v0.8b, #0
-; VBITS_GE_128-NEXT:    uunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
 ; VBITS_GE_128-NEXT:    ret
@@ -884,30 +917,28 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT:    uunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT:    uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT:    uunpkhi z6.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    ushll v6.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ushll v7.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    uunpklo z3.h, z0.b
-; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    uunpkhi z6.s, z4.h
-; VBITS_GE_128-NEXT:    uunpkhi z7.s, z3.h
-; VBITS_GE_128-NEXT:    uunpklo z4.s, z4.h
-; VBITS_GE_128-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z2.h, z5.h
-; VBITS_GE_128-NEXT:    uzp1 z3.h, z3.h, z6.h
-; VBITS_GE_128-NEXT:    uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v6.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v7.8h, #0
+; VBITS_GE_128-NEXT:    ushll v6.4s, v6.4h, #0
+; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    ushll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
+; VBITS_GE_128-NEXT:    movprfx z4, z7
+; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
 ; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v16i8:
@@ -916,15 +947,20 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
 ; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    uunpklo z4.s, z2.h
+; VBITS_GE_256-NEXT:    uunpklo z5.s, z3.h
+; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpkhi z4.s, z2.h
-; VBITS_GE_256-NEXT:    uunpkhi z5.s, z3.h
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    splice z4.h, p0, z4.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
@@ -1005,14 +1041,19 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -1030,26 +1071,42 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    uunpkhi z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z3.h, z0.b
-; CHECK-NEXT:    uunpklo z4.h, z1.b
-; CHECK-NEXT:    uunpklo z5.h, z0.b
-; CHECK-NEXT:    uunpkhi z6.s, z2.h
-; CHECK-NEXT:    uunpkhi z7.s, z3.h
+; CHECK-NEXT:    ptrue p2.h, vl64
+; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpklo z5.s, z2.h
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    udivr z5.s, p1/m, z5.s, z6.s
+; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
-; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #128
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uunpkhi z3.s, z5.h
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #128
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z3.h, z4.b
+; CHECK-NEXT:    uunpklo z4.h, z6.b
+; CHECK-NEXT:    splice z5.h, p2, z5.h, z2.h
+; CHECK-NEXT:    uunpklo z2.s, z3.h
+; CHECK-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #128
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z7.s
-; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z6.s
+; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT:    ptrue p1.b, vl128
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z4.b, p1, z4.b, z2.b
+; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
@@ -1110,19 +1167,16 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v4.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    uunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT:    ushll v5.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    movprfx z3, z5
 ; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
-; VBITS_GE_128-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v8i16:
@@ -1157,26 +1211,26 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    uunpkhi z17.s, z2.h
-; VBITS_GE_128-NEXT:    ldp q3, q1, [x1]
-; VBITS_GE_128-NEXT:    uunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT:    uunpklo z7.s, z0.h
-; VBITS_GE_128-NEXT:    uunpkhi z16.s, z3.h
-; VBITS_GE_128-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
-; VBITS_GE_128-NEXT:    uunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT:    uunpklo z6.s, z1.h
-; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    uunpklo z5.s, z3.h
-; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT:    uunpklo z7.s, z2.h
-; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT:    uzp1 z4.h, z6.h, z4.h
-; VBITS_GE_128-NEXT:    uzp1 z5.h, z5.h, z16.h
-; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v3.8h
-; VBITS_GE_128-NEXT:    mls v0.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT:    stp q2, q0, [x0]
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v7.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ushll2 v4.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v6.4s, v2.8h, #0
+; VBITS_GE_128-NEXT:    ushll v16.4s, v2.4h, #0
+; VBITS_GE_128-NEXT:    ushll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    ushll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
+; VBITS_GE_128-NEXT:    ushll v16.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z17.s
+; VBITS_GE_128-NEXT:    uzp1 v5.8h, v7.8h, v5.8h
+; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z16.s
+; VBITS_GE_128-NEXT:    uzp1 v4.8h, v6.8h, v4.8h
+; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v0.8h
+; VBITS_GE_128-NEXT:    mls v3.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v16i16:
@@ -1185,14 +1239,20 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT:    uunpkhi z3.s, z0.h
-; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    uunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT:    mov z3.d, z1.d
+; VBITS_GE_256-NEXT:    mov z4.d, z0.d
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT:    movprfx z3, z5
-; VBITS_GE_256-NEXT:    udiv z3.s, p1/m, z3.s, z4.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
+; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -1266,14 +1326,20 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
+; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    udiv z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index b22545526faf..b6547cef81b9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -36,14 +36,19 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
@@ -55,26 +60,43 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpkhi z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpkhi z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    sunpklo z5.s, z1.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpkhi z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z5.s, z0.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
@@ -84,48 +106,76 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q3, q0, [x1]
+; CHECK-NEXT:    ldp q0, q2, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q1, [x0]
-; CHECK-NEXT:    sunpkhi z4.h, z0.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpkhi z16.s, z0.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpkhi z5.h, z1.b
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpkhi z7.s, z5.h
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ptrue p2.b, vl8
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z5.h, z5.b
+; CHECK-NEXT:    sunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z4.h, z4.b
+; CHECK-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpkhi z5.s, z1.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z6.h
-; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    sunpkhi z1.h, z3.b
-; CHECK-NEXT:    sunpkhi z6.h, z2.b
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT:    sunpkhi z7.s, z1.h
-; CHECK-NEXT:    sunpkhi z16.s, z6.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT:    sunpkhi z6.s, z3.h
-; CHECK-NEXT:    sunpkhi z16.s, z2.h
+; CHECK-NEXT:    sunpklo z7.s, z3.h
+; CHECK-NEXT:    sunpklo z5.s, z2.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    splice z6.h, p1, z6.h, z4.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z16.s
+; CHECK-NEXT:    uzp1 z4.b, z6.b, z6.b
+; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z7.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT:    uzp1 z1.b, z2.b, z1.b
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z4.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    sunpklo z3.h, z1.b
+; CHECK-NEXT:    sunpklo z6.h, z0.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z7.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sunpklo z7.s, z6.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z6.s, z6.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
+; CHECK-NEXT:    splice z2.b, p2, z2.b, z4.b
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
@@ -172,14 +222,21 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = sdiv <8 x i16> %op1, %op2
@@ -189,24 +246,34 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; CHECK-LABEL: sdiv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    ldp q3, q0, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z6.s, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sunpklo z4.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    sunpkhi z4.s, z1.h
+; CHECK-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpkhi z5.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z2.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpkhi z5.s, z3.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z4.h
-; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    sunpklo z2.s, z3.h
+; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    movprfx z2, z7
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b
@@ -331,14 +398,19 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
@@ -350,26 +422,43 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpkhi z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uunpklo z5.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpkhi z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
@@ -379,48 +468,76 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 ; CHECK-LABEL: udiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q3, q0, [x1]
+; CHECK-NEXT:    ldp q0, q2, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q1, [x0]
-; CHECK-NEXT:    uunpkhi z4.h, z0.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpkhi z16.s, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpkhi z5.h, z1.b
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z7.s, z5.h
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ptrue p2.b, vl8
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    uunpklo z5.h, z5.b
+; CHECK-NEXT:    uunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z4.h, z4.b
+; CHECK-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpkhi z5.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z6.h
-; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uunpkhi z1.h, z3.b
-; CHECK-NEXT:    uunpkhi z6.h, z2.b
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT:    uunpkhi z7.s, z1.h
-; CHECK-NEXT:    uunpkhi z16.s, z6.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT:    uunpkhi z6.s, z3.h
-; CHECK-NEXT:    uunpkhi z16.s, z2.h
+; CHECK-NEXT:    uunpklo z7.s, z3.h
+; CHECK-NEXT:    uunpklo z5.s, z2.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    splice z6.h, p1, z6.h, z4.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z16.s
+; CHECK-NEXT:    uzp1 z4.b, z6.b, z6.b
+; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z7.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT:    uzp1 z1.b, z2.b, z1.b
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z4.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    uunpklo z3.h, z1.b
+; CHECK-NEXT:    uunpklo z6.h, z0.b
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z7.s, z0.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z2.s, z3.h
+; CHECK-NEXT:    uunpklo z7.s, z6.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z6.s, z6.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
+; CHECK-NEXT:    splice z2.b, p2, z2.b, z4.b
+; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
@@ -465,14 +582,21 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = udiv <8 x i16> %op1, %op2
@@ -482,24 +606,34 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; CHECK-LABEL: udiv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
+; CHECK-NEXT:    ldp q3, q0, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z6.s, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    uunpkhi z4.s, z1.h
+; CHECK-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z5.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z2.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z4.h
-; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    uunpklo z2.s, z3.h
+; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    movprfx z2, z7
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
   %op2 = load <16 x i16>, <16 x i16>* %b

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index e8fe9c33b8fd..f0f74e277ca4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -36,16 +36,21 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -58,27 +63,44 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    sunpkhi z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    sunpklo z5.s, z2.h
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z5.s, z2.h
-; CHECK-NEXT:    sunpkhi z6.s, z3.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z4.h, z1.b
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpkhi z7.s, z3.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpklo z6.h, z0.b
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z4.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT:    sunpklo z2.s, z6.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    sunpklo z6.s, z6.h
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z6
 ; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z4.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -89,51 +111,81 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 ; CHECK-LABEL: srem_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    sunpkhi z5.h, z0.b
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ldp q3, q2, [x1]
+; CHECK-NEXT:    mov z5.d, z0.d
 ; CHECK-NEXT:    sunpklo z7.h, z0.b
-; CHECK-NEXT:    sunpkhi z17.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z5.h, z5.b
+; CHECK-NEXT:    sunpklo z18.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    sunpkhi z4.h, z1.b
-; CHECK-NEXT:    sunpklo z6.h, z1.b
-; CHECK-NEXT:    sunpkhi z16.s, z4.h
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    sunpklo z6.h, z2.b
+; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z16.s, z6.h
+; CHECK-NEXT:    sunpklo z4.h, z4.b
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z17.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpkhi z18.s, z6.h
+; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpkhi z5.s, z7.h
+; CHECK-NEXT:    sunpklo z18.s, z7.h
+; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    sunpklo z5.s, z6.h
+; CHECK-NEXT:    splice z17.h, p1, z17.h, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z7.h
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    sunpklo z6.h, z6.b
+; CHECK-NEXT:    sunpklo z7.h, z7.b
+; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT:    sunpklo z16.s, z6.h
+; CHECK-NEXT:    sunpklo z18.s, z7.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z18.s
+; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z5.h
-; CHECK-NEXT:    sunpkhi z6.h, z3.b
-; CHECK-NEXT:    sunpkhi z7.h, z2.b
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z16.h
-; CHECK-NEXT:    sunpkhi z16.s, z6.h
-; CHECK-NEXT:    sunpkhi z17.s, z7.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT:    sunpklo z4.h, z3.b
+; CHECK-NEXT:    sunpklo z6.h, z1.b
+; CHECK-NEXT:    sunpklo z16.s, z4.h
+; CHECK-NEXT:    sunpklo z18.s, z6.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sunpklo z7.h, z3.b
-; CHECK-NEXT:    sunpklo z17.h, z2.b
-; CHECK-NEXT:    sunpkhi z18.s, z7.h
-; CHECK-NEXT:    sunpkhi z19.s, z17.h
-; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sunpklo z17.s, z17.h
-; CHECK-NEXT:    sdivr z18.s, p0/m, z18.s, z19.s
-; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z17.s
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z16.h
-; CHECK-NEXT:    uzp1 z7.h, z7.h, z18.h
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    uzp1 z6.b, z7.b, z6.b
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT:    mls z2.b, p0/m, z6.b, z3.b
-; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    splice z16.h, p1, z16.h, z4.h
+; CHECK-NEXT:    uzp1 z6.b, z17.b, z17.b
+; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT:    uzp1 z7.b, z16.b, z16.b
+; CHECK-NEXT:    ptrue p1.b, vl16
+; CHECK-NEXT:    splice z7.b, p0, z7.b, z4.b
+; CHECK-NEXT:    splice z5.b, p0, z5.b, z6.b
+; CHECK-NEXT:    mls z1.b, p1/m, z7.b, z3.b
+; CHECK-NEXT:    mls z0.b, p1/m, z5.b, z2.b
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
@@ -165,17 +217,23 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z1.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
@@ -187,24 +245,38 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q2, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpkhi z17.s, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl8
+; CHECK-NEXT:    mov z17.d, z2.d
+; CHECK-NEXT:    ext z17.b, z17.b, z2.b, #8
 ; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    sunpkhi z5.s, z0.h
+; CHECK-NEXT:    mov z5.d, z0.d
 ; CHECK-NEXT:    sunpklo z7.s, z0.h
-; CHECK-NEXT:    sunpkhi z16.s, z3.h
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    sunpkhi z4.s, z1.h
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    mov z16.d, z3.d
+; CHECK-NEXT:    ext z16.b, z16.b, z3.b, #8
+; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sunpklo z6.s, z1.h
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sunpklo z7.s, z2.h
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z16.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
-; CHECK-NEXT:    mls z2.h, p0/m, z5.h, z3.h
-; CHECK-NEXT:    mls z0.h, p0/m, z4.h, z1.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z7.s, z16.h
+; CHECK-NEXT:    sunpklo z16.s, z17.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    movprfx z5, z16
+; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT:    sunpklo z7.s, z3.h
+; CHECK-NEXT:    sunpklo z16.s, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT:    mls z2.h, p1/m, z7.h, z3.h
+; CHECK-NEXT:    mls z0.h, p1/m, z5.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a
@@ -347,16 +419,21 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -369,27 +446,44 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    uunpkhi z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    uunpklo z5.s, z2.h
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z5.s, z2.h
-; CHECK-NEXT:    uunpkhi z6.s, z3.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z4.h, z1.b
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpkhi z7.s, z3.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpklo z6.h, z0.b
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z4.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT:    uunpklo z2.s, z6.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    uunpklo z6.s, z6.h
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    movprfx z3, z6
 ; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z4.b
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -400,51 +494,81 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
 ; CHECK-LABEL: urem_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    uunpkhi z5.h, z0.b
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ldp q3, q2, [x1]
+; CHECK-NEXT:    mov z5.d, z0.d
 ; CHECK-NEXT:    uunpklo z7.h, z0.b
-; CHECK-NEXT:    uunpkhi z17.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z5.h, z5.b
+; CHECK-NEXT:    uunpklo z18.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    uunpkhi z4.h, z1.b
-; CHECK-NEXT:    uunpklo z6.h, z1.b
-; CHECK-NEXT:    uunpkhi z16.s, z4.h
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    uunpklo z6.h, z2.b
+; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z16.s, z6.h
+; CHECK-NEXT:    uunpklo z4.h, z4.b
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z17.s, z4.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpkhi z18.s, z6.h
+; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpkhi z5.s, z7.h
+; CHECK-NEXT:    uunpklo z18.s, z7.h
+; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    uunpklo z5.s, z6.h
+; CHECK-NEXT:    splice z17.h, p1, z17.h, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z7.h
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    uunpklo z6.h, z6.b
+; CHECK-NEXT:    uunpklo z7.h, z7.b
+; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT:    uunpklo z16.s, z6.h
+; CHECK-NEXT:    uunpklo z18.s, z7.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z18.s
+; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z5.h
-; CHECK-NEXT:    uunpkhi z6.h, z3.b
-; CHECK-NEXT:    uunpkhi z7.h, z2.b
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z16.h
-; CHECK-NEXT:    uunpkhi z16.s, z6.h
-; CHECK-NEXT:    uunpkhi z17.s, z7.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT:    uunpklo z4.h, z3.b
+; CHECK-NEXT:    uunpklo z6.h, z1.b
+; CHECK-NEXT:    uunpklo z16.s, z4.h
+; CHECK-NEXT:    uunpklo z18.s, z6.h
+; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uunpklo z7.h, z3.b
-; CHECK-NEXT:    uunpklo z17.h, z2.b
-; CHECK-NEXT:    uunpkhi z18.s, z7.h
-; CHECK-NEXT:    uunpkhi z19.s, z17.h
-; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    uunpklo z17.s, z17.h
-; CHECK-NEXT:    udivr z18.s, p0/m, z18.s, z19.s
-; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z17.s
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z16.h
-; CHECK-NEXT:    uzp1 z7.h, z7.h, z18.h
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    uzp1 z6.b, z7.b, z6.b
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT:    mls z2.b, p0/m, z6.b, z3.b
-; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    splice z16.h, p1, z16.h, z4.h
+; CHECK-NEXT:    uzp1 z6.b, z17.b, z17.b
+; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT:    uzp1 z7.b, z16.b, z16.b
+; CHECK-NEXT:    ptrue p1.b, vl16
+; CHECK-NEXT:    splice z7.b, p0, z7.b, z4.b
+; CHECK-NEXT:    splice z5.b, p0, z5.b, z6.b
+; CHECK-NEXT:    mls z1.b, p1/m, z7.b, z3.b
+; CHECK-NEXT:    mls z0.b, p1/m, z5.b, z2.b
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, <32 x i8>* %a
   %op2 = load <32 x i8>, <32 x i8>* %b
@@ -476,17 +600,23 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
@@ -498,24 +628,38 @@ define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q2, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpkhi z17.s, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl8
+; CHECK-NEXT:    mov z17.d, z2.d
+; CHECK-NEXT:    ext z17.b, z17.b, z2.b, #8
 ; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    mov z5.d, z0.d
 ; CHECK-NEXT:    uunpklo z7.s, z0.h
-; CHECK-NEXT:    uunpkhi z16.s, z3.h
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT:    uunpkhi z4.s, z1.h
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    mov z16.d, z3.d
+; CHECK-NEXT:    ext z16.b, z16.b, z3.b, #8
+; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uunpklo z7.s, z2.h
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z16.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
-; CHECK-NEXT:    mls z2.h, p0/m, z5.h, z3.h
-; CHECK-NEXT:    mls z0.h, p0/m, z4.h, z1.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z7.s, z16.h
+; CHECK-NEXT:    uunpklo z16.s, z17.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    movprfx z5, z16
+; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT:    uunpklo z7.s, z3.h
+; CHECK-NEXT:    uunpklo z16.s, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT:    mls z2.h, p1/m, z7.h, z3.h
+; CHECK-NEXT:    mls z0.h, p1/m, z5.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, <16 x i16>* %a


        


More information about the llvm-commits mailing list