[llvm] 8812b6e - [AArch64][SVE][Fixed length] Fix div miscompile
Peter Waller via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 12 03:33:12 PST 2022
Author: Peter Waller
Date: 2022-12-12T11:31:02Z
New Revision: 8812b6eed7b2328d6e2739071f6460bfd47ac8e2
URL: https://github.com/llvm/llvm-project/commit/8812b6eed7b2328d6e2739071f6460bfd47ac8e2
DIFF: https://github.com/llvm/llvm-project/commit/8812b6eed7b2328d6e2739071f6460bfd47ac8e2.diff
LOG: [AArch64][SVE][Fixed length] Fix div miscompile
The prior code worked before SVE DIV was enabled for 128-bit vectors.
With 128 bit vectors, when run on a 256 bit machine, it would split and
do a signed unpack, but this resulted in one full vector and one empty
vector with a half-sized predicate. The effect was that only half the
elements were treated correctly.
The fix is to bisect the vector, sign- or zero-extend each half (matching
the signedness of the operation), do the division, truncate and then concat.
Fixes #59357.
Differential Revision: https://reviews.llvm.org/D139618
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f79b0f16e656..6ed58ab5662d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22624,50 +22624,39 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
return LowerToPredicatedOp(Op, DAG, PredOpcode);
// Scalable vector i8/i16 DIV is not supported. Promote it to i32.
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
- EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
- EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
-
- // If this is not a full vector, extend, div, and truncate it.
- EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
- if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
- unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
- SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
- SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
+ EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
+ unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ // If the wider type is legal: extend, op, and truncate.
+ EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+ if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
+ SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
+ SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
+ SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
}
- // Convert the operands to scalable vectors.
- SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
- SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+ auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
+ &ExtendOpcode](SDValue Op) {
+ SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
+ SDValue IdxHalf =
+ DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
+ return std::pair<SDValue, SDValue>(
+ {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
+ DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
+ };
- // Extend the scalable operands.
- unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
- unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
- SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
- SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
- SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
- SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
-
- // Convert back to fixed vectors so the DIV can be further lowered.
- Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
- Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
- Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
- Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
- SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
- Op0Lo, Op1Lo);
- SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
- Op0Hi, Op1Hi);
-
- // Convert again to scalable vectors to truncate.
- ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
- ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
- SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
- ResultLo, ResultHi);
-
- return convertFromScalableVector(DAG, VT, ScalableResult);
+ // If wider type is not legal: split, extend, op, trunc and concat.
+ auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
+ auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
+ SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
+ SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index ca396356abe2..cfd755e20f12 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -18,13 +18,13 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
-; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
@@ -94,29 +94,26 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i8:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpklo z1.h, z1.b
-; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: sunpkhi z3.s, z1.h
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sshll2 v3.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h
-; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i8:
@@ -126,14 +123,19 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z2.b, z2.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
@@ -206,15 +208,20 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl64
+; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h
; CHECK-NEXT: ptrue p0.h, vl128
-; CHECK-NEXT: st1b { z0.h }, p0, [x0]
+; CHECK-NEXT: st1b { z2.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
@@ -230,26 +237,41 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT: sunpkhi z2.h, z1.b
-; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p2.h, vl64
+; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEXT: sunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z4.s, z2.h
-; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT: sunpkhi z5.s, z1.h
+; CHECK-NEXT: sunpklo z5.s, z1.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT: sunpklo z4.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: splice z3.h, p2, z3.h, z2.h
+; CHECK-NEXT: movprfx z2, z4
+; CHECK-NEXT: sdiv z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
+; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT: ptrue p1.b, vl128
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: splice z2.b, p1, z2.b, z0.b
+; CHECK-NEXT: st1b { z2.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
@@ -308,17 +330,14 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v8i16:
@@ -351,24 +370,25 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT: ldp q3, q0, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpkhi z6.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_128-NEXT: ldp q3, q2, [x0]
-; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT: sshll2 v4.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z3.s
-; VBITS_GE_128-NEXT: sdivr z1.s, p0/m, z1.s, z2.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h
-; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_128-NEXT: movprfx z2, z7
+; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z6.s
+; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: sdiv_v16i16:
@@ -377,14 +397,19 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sdiv_v16i16:
@@ -450,14 +475,19 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT: st1h { z2.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
@@ -724,13 +754,13 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
-; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT: ret
;
@@ -800,29 +830,26 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v16i8:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpklo z1.h, z1.b
-; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z0.h, z0.b
+; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: uunpkhi z3.s, z1.h
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: ushll2 v3.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z1.h, z2.h, z4.h
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z3.h
-; VBITS_GE_128-NEXT: uzp1 z0.b, z0.b, z1.b
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; VBITS_GE_128-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i8:
@@ -832,14 +859,19 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h
+; VBITS_GE_256-NEXT: uzp1 z0.b, z2.b, z2.b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
;
@@ -900,14 +932,19 @@ define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
-; CHECK-NEXT: uunpkhi z2.s, z0.h
-; CHECK-NEXT: uunpkhi z3.s, z1.h
+; CHECK-NEXT: uunpklo z2.s, z0.h
+; CHECK-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: st1b { z0.h }, p0, [x0]
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT: st1b { z2.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
@@ -923,26 +960,41 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT: uunpkhi z2.h, z1.b
-; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: ptrue p2.h, vl64
+; CHECK-NEXT: uunpklo z2.h, z1.b
+; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEXT: uunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z4.s, z2.h
-; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT: uunpkhi z5.s, z1.h
+; CHECK-NEXT: uunpklo z5.s, z1.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT: uunpklo z4.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-NEXT: splice z3.h, p2, z3.h, z2.h
+; CHECK-NEXT: movprfx z2, z4
+; CHECK-NEXT: udiv z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b
+; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT: ptrue p1.b, vl128
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: splice z2.b, p1, z2.b, z0.b
+; CHECK-NEXT: st1b { z2.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
@@ -1001,17 +1053,14 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v8i16:
@@ -1044,24 +1093,25 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
+; VBITS_GE_128-NEXT: ldp q3, q0, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpkhi z6.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_128-NEXT: ldp q3, q2, [x0]
-; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT: ushll2 v4.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z3.s
-; VBITS_GE_128-NEXT: udivr z1.s, p0/m, z1.s, z2.s
-; VBITS_GE_128-NEXT: uzp1 z0.h, z0.h, z5.h
-; VBITS_GE_128-NEXT: uzp1 z1.h, z1.h, z4.h
-; VBITS_GE_128-NEXT: stp q0, q1, [x0]
+; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z2.s
+; VBITS_GE_128-NEXT: movprfx z2, z7
+; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z6.s
+; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h
+; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT: stp q1, q0, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: udiv_v16i16:
@@ -1070,14 +1120,19 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
+; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT: udiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z2.h
-; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z0.h
+; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: udiv_v16i16:
@@ -1134,14 +1189,19 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT: uunpkhi z2.s, z1.h
-; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z2.h, p1, z2.h, z0.h
+; CHECK-NEXT: st1h { z2.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 9312711530e7..fb060f4dcae5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -18,13 +18,13 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0
-; VBITS_GE_128-NEXT: sunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT: ret
@@ -97,30 +97,28 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v16i8:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT: sunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT: sunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT: sunpkhi z6.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: sshll v6.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: sshll v7.8h, v0.8b, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: sunpklo z3.h, z0.b
-; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT: sunpkhi z6.s, z4.h
-; VBITS_GE_128-NEXT: sunpkhi z7.s, z3.h
-; VBITS_GE_128-NEXT: sunpklo z4.s, z4.h
-; VBITS_GE_128-NEXT: sunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h
-; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h
-; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT: sshll2 v3.4s, v6.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v7.8h, #0
+; VBITS_GE_128-NEXT: sshll v6.4s, v6.4h, #0
+; VBITS_GE_128-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: sshll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
+; VBITS_GE_128-NEXT: movprfx z4, z7
+; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT: uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v16i8:
@@ -129,15 +127,20 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h
+; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: sunpkhi z4.s, z2.h
-; VBITS_GE_256-NEXT: sunpkhi z5.s, z3.h
; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z4.h, p0, z4.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z4.b, z4.b
; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
@@ -218,14 +221,19 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
-; CHECK-NEXT: sunpkhi z4.s, z2.h
-; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEXT: sunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
@@ -243,26 +251,42 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT: sunpkhi z2.h, z1.b
-; CHECK-NEXT: sunpkhi z3.h, z0.b
-; CHECK-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEXT: sunpklo z5.h, z0.b
-; CHECK-NEXT: sunpkhi z6.s, z2.h
-; CHECK-NEXT: sunpkhi z7.s, z3.h
+; CHECK-NEXT: ptrue p2.h, vl64
+; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sunpklo z5.s, z2.h
+; CHECK-NEXT: sunpklo z6.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdivr z5.s, p1/m, z5.s, z6.s
+; CHECK-NEXT: mov z6.d, z0.d
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s
-; CHECK-NEXT: sunpkhi z7.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z1.b, #128
; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: sunpkhi z3.s, z5.h
+; CHECK-NEXT: ext z6.b, z6.b, z0.b, #128
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: sunpklo z3.h, z4.b
+; CHECK-NEXT: sunpklo z4.h, z6.b
+; CHECK-NEXT: splice z5.h, p2, z5.h, z2.h
+; CHECK-NEXT: sunpklo z2.s, z3.h
+; CHECK-NEXT: sunpklo z6.s, z4.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #128
+; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z4.s, z4.h
-; CHECK-NEXT: sunpklo z5.s, z5.h
-; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z7.s
-; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z6.s
+; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT: splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT: ptrue p1.b, vl128
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: splice z4.b, p1, z4.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
@@ -323,19 +347,16 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: sunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v4.4s, v1.4h, #0
; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: sunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT: sshll v5.4s, v0.4h, #0
; VBITS_GE_128-NEXT: movprfx z3, z5
; VBITS_GE_128-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v8i16:
@@ -370,26 +391,26 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @srem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: sunpkhi z17.s, z2.h
-; VBITS_GE_128-NEXT: ldp q3, q1, [x1]
-; VBITS_GE_128-NEXT: sunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT: sunpklo z7.s, z0.h
-; VBITS_GE_128-NEXT: sunpkhi z16.s, z3.h
-; VBITS_GE_128-NEXT: sdivr z16.s, p0/m, z16.s, z17.s
-; VBITS_GE_128-NEXT: sunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT: sunpklo z6.s, z1.h
-; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT: sunpklo z5.s, z3.h
-; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT: sunpklo z7.s, z2.h
-; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h
-; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h
-; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h
-; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: sshll v7.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: sshll2 v6.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: sshll v16.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: sshll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
+; VBITS_GE_128-NEXT: sshll v16.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s
+; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h
+; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z16.s
+; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h
+; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h
+; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT: stp q2, q3, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v16i16:
@@ -398,14 +419,20 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: sunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: sunpkhi z3.s, z0.h
-; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT: sunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT: mov z3.d, z1.d
+; VBITS_GE_256-NEXT: mov z4.d, z0.d
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
+; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: sunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT: movprfx z3, z5
-; VBITS_GE_256-NEXT: sdiv z3.s, p1/m, z3.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -479,14 +506,20 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
-; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: sunpklo z4.s, z1.h
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
+; CHECK-NEXT: sunpklo z2.s, z1.h
; CHECK-NEXT: sunpklo z5.s, z0.h
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: sdiv z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sunpklo z4.s, z4.h
+; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
@@ -805,13 +838,13 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0
-; VBITS_GE_128-NEXT: uunpkhi z4.s, z2.h
-; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z4.h
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT: xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT: mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT: ret
@@ -884,30 +917,28 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v16i8:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_128-NEXT: uunpkhi z2.h, z1.b
-; VBITS_GE_128-NEXT: uunpkhi z3.h, z0.b
+; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z2.h
-; VBITS_GE_128-NEXT: uunpkhi z6.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z4.h, z1.b
+; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT: ushll v6.8h, v1.8b, #0
+; VBITS_GE_128-NEXT: ushll v7.8h, v0.8b, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: uunpklo z3.h, z0.b
-; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT: uunpkhi z6.s, z4.h
-; VBITS_GE_128-NEXT: uunpkhi z7.s, z3.h
-; VBITS_GE_128-NEXT: uunpklo z4.s, z4.h
-; VBITS_GE_128-NEXT: uunpklo z3.s, z3.h
-; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z2.h, z5.h
-; VBITS_GE_128-NEXT: uzp1 z3.h, z3.h, z6.h
-; VBITS_GE_128-NEXT: uzp1 z2.b, z3.b, z2.b
+; VBITS_GE_128-NEXT: ushll2 v3.4s, v6.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v7.8h, #0
+; VBITS_GE_128-NEXT: ushll v6.4s, v6.4h, #0
+; VBITS_GE_128-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT: ushll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v2.8h, v4.8h
+; VBITS_GE_128-NEXT: movprfx z4, z7
+; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z6.s
+; VBITS_GE_128-NEXT: uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT: uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT: mls v0.16b, v2.16b, v1.16b
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v16i8:
@@ -916,15 +947,20 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT: uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT: uunpklo z4.s, z2.h
+; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h
+; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: uunpkhi z4.s, z2.h
-; VBITS_GE_256-NEXT: uunpkhi z5.s, z3.h
; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z4.h
-; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT: uzp1 z4.h, z4.h, z4.h
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: ptrue p0.h, vl8
+; VBITS_GE_256-NEXT: splice z4.h, p0, z4.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z2.b, z4.b, z4.b
; VBITS_GE_256-NEXT: mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT: ret
@@ -1005,14 +1041,19 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
-; CHECK-NEXT: uunpkhi z4.s, z2.h
-; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEXT: uunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
@@ -1030,26 +1071,42 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT: uunpkhi z2.h, z1.b
-; CHECK-NEXT: uunpkhi z3.h, z0.b
-; CHECK-NEXT: uunpklo z4.h, z1.b
-; CHECK-NEXT: uunpklo z5.h, z0.b
-; CHECK-NEXT: uunpkhi z6.s, z2.h
-; CHECK-NEXT: uunpkhi z7.s, z3.h
+; CHECK-NEXT: ptrue p2.h, vl64
+; CHECK-NEXT: uunpklo z2.h, z1.b
+; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEXT: uunpklo z6.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udivr z5.s, p1/m, z5.s, z6.s
+; CHECK-NEXT: mov z6.d, z0.d
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s
-; CHECK-NEXT: uunpkhi z7.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z1.b, #128
; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: uunpkhi z3.s, z5.h
+; CHECK-NEXT: ext z6.b, z6.b, z0.b, #128
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uunpklo z3.h, z4.b
+; CHECK-NEXT: uunpklo z4.h, z6.b
+; CHECK-NEXT: splice z5.h, p2, z5.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z3.h
+; CHECK-NEXT: uunpklo z6.s, z4.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #128
+; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z4.s, z4.h
-; CHECK-NEXT: uunpklo z5.s, z5.h
-; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z7.s
-; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
-; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z6.s
+; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT: splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT: ptrue p1.b, vl128
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: splice z4.b, p1, z4.b, z2.b
+; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
@@ -1110,19 +1167,16 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: // kill: def $q1 killed $q1 def $z1
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 def $z0
+; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_128-NEXT: uunpkhi z3.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z4.s, z1.h
+; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v4.4s, v1.4h, #0
; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT: uunpklo z5.s, z0.h
+; VBITS_GE_128-NEXT: ushll v5.4s, v0.4h, #0
; VBITS_GE_128-NEXT: movprfx z3, z5
; VBITS_GE_128-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; VBITS_GE_128-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_128-NEXT: uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT: mls v0.8h, v2.8h, v1.8h
-; VBITS_GE_128-NEXT: // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v8i16:
@@ -1157,26 +1211,26 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @urem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i16:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ldp q0, q1, [x1]
; VBITS_GE_128-NEXT: ptrue p0.s, vl4
-; VBITS_GE_128-NEXT: uunpkhi z17.s, z2.h
-; VBITS_GE_128-NEXT: ldp q3, q1, [x1]
-; VBITS_GE_128-NEXT: uunpkhi z5.s, z0.h
-; VBITS_GE_128-NEXT: uunpklo z7.s, z0.h
-; VBITS_GE_128-NEXT: uunpkhi z16.s, z3.h
-; VBITS_GE_128-NEXT: udivr z16.s, p0/m, z16.s, z17.s
-; VBITS_GE_128-NEXT: uunpkhi z4.s, z1.h
-; VBITS_GE_128-NEXT: uunpklo z6.s, z1.h
-; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT: uunpklo z5.s, z3.h
-; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; VBITS_GE_128-NEXT: uunpklo z7.s, z2.h
-; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT: uzp1 z4.h, z6.h, z4.h
-; VBITS_GE_128-NEXT: uzp1 z5.h, z5.h, z16.h
-; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v3.8h
-; VBITS_GE_128-NEXT: mls v0.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT: ushll v7.4s, v0.4h, #0
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0
+; VBITS_GE_128-NEXT: ushll2 v6.4s, v2.8h, #0
+; VBITS_GE_128-NEXT: ushll v16.4s, v2.4h, #0
+; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT: ushll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s
+; VBITS_GE_128-NEXT: ushll v16.4s, v3.4h, #0
+; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s
+; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h
+; VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z16.s
+; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h
+; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h
+; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h
+; VBITS_GE_128-NEXT: stp q2, q3, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v16i16:
@@ -1185,14 +1239,20 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-NEXT: ptrue p1.s, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT: uunpkhi z2.s, z1.h
-; VBITS_GE_256-NEXT: uunpkhi z3.s, z0.h
-; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s
-; VBITS_GE_256-NEXT: uunpklo z4.s, z1.h
+; VBITS_GE_256-NEXT: mov z3.d, z1.d
+; VBITS_GE_256-NEXT: mov z4.d, z0.d
+; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT: ext z4.b, z4.b, z0.b, #16
+; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT: uunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT: movprfx z3, z5
-; VBITS_GE_256-NEXT: udiv z3.s, p1/m, z3.s, z4.s
-; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z2.h
+; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h
+; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h
+; VBITS_GE_256-NEXT: ptrue p1.h, vl8
+; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
@@ -1266,14 +1326,20 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-NEXT: ptrue p1.s, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT: uunpkhi z2.s, z1.h
-; CHECK-NEXT: uunpkhi z3.s, z0.h
-; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT: uunpklo z4.s, z1.h
+; CHECK-NEXT: mov z3.d, z1.d
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT: ext z4.b, z4.b, z0.b, #128
+; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpklo z5.s, z0.h
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: udiv z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: uunpklo z4.s, z4.h
+; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: ptrue p1.h, vl64
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index b22545526faf..b6547cef81b9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -36,14 +36,19 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: sunpklo z1.h, z1.b
; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <8 x i8> %op1, %op2
@@ -55,26 +60,43 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: sunpkhi z2.h, z1.b
-; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEXT: sunpklo z3.h, z3.b
+; CHECK-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEXT: sunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: sunpklo z1.h, z1.b
-; CHECK-NEXT: sunpkhi z4.s, z2.h
-; CHECK-NEXT: sunpkhi z5.s, z3.h
+; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: sunpklo z5.s, z1.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: sunpkhi z3.s, z1.h
-; CHECK-NEXT: sunpkhi z5.s, z0.h
+; CHECK-NEXT: sunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT: movprfx z2, z3
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <16 x i8> %op1, %op2
@@ -84,48 +106,76 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: ldp q0, q2, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q1, [x0]
-; CHECK-NEXT: sunpkhi z4.h, z0.b
-; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: sunpkhi z6.s, z4.h
-; CHECK-NEXT: sunpklo z4.s, z4.h
-; CHECK-NEXT: sunpkhi z16.s, z0.h
-; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sunpkhi z5.h, z1.b
-; CHECK-NEXT: sunpklo z1.h, z1.b
-; CHECK-NEXT: sunpkhi z7.s, z5.h
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: ptrue p2.b, vl8
+; CHECK-NEXT: ldp q1, q3, [x1]
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT: sunpklo z5.h, z5.b
+; CHECK-NEXT: sunpklo z7.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: sunpklo z5.s, z5.h
+; CHECK-NEXT: mov z4.d, z3.d
+; CHECK-NEXT: sunpklo z3.h, z3.b
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: sunpklo z4.h, z4.b
+; CHECK-NEXT: sunpklo z6.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: sunpkhi z5.s, z1.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h
-; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: sunpkhi z1.h, z3.b
-; CHECK-NEXT: sunpkhi z6.h, z2.b
-; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT: sunpkhi z7.s, z1.h
-; CHECK-NEXT: sunpkhi z16.s, z6.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: sunpklo z6.s, z6.h
-; CHECK-NEXT: sunpklo z3.h, z3.b
-; CHECK-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT: sunpkhi z6.s, z3.h
-; CHECK-NEXT: sunpkhi z16.s, z2.h
+; CHECK-NEXT: sunpklo z7.s, z3.h
+; CHECK-NEXT: sunpklo z5.s, z2.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: splice z6.h, p1, z6.h, z4.h
; CHECK-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z16.s
+; CHECK-NEXT: uzp1 z4.b, z6.b, z6.b
+; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z7.s
; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h
-; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: sunpklo z3.h, z1.b
+; CHECK-NEXT: sunpklo z6.h, z0.b
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: sunpklo z1.h, z1.b
+; CHECK-NEXT: sunpklo z0.h, z0.b
+; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: sunpklo z7.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: sunpklo z2.s, z3.h
+; CHECK-NEXT: sunpklo z7.s, z6.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sunpklo z6.s, z6.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b
+; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
@@ -172,14 +222,21 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = sdiv <8 x i16> %op1, %op2
@@ -189,24 +246,34 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z6.s, z0.h
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: sunpklo z6.s, z3.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: sunpklo z4.s, z0.h
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: sunpkhi z4.s, z1.h
+; CHECK-NEXT: sunpklo z7.s, z1.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: sunpkhi z5.s, z2.h
+; CHECK-NEXT: sunpklo z5.s, z2.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: sunpkhi z5.s, z3.h
-; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: sunpklo z2.s, z3.h
+; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: movprfx z2, z7
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h
+; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
+; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
@@ -331,14 +398,19 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: uunpklo z1.h, z1.b
; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z2.s, z1.h
-; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z2.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = udiv <8 x i8> %op1, %op2
@@ -350,26 +422,43 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: uunpkhi z2.h, z1.b
-; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEXT: uunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: uunpkhi z4.s, z2.h
-; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uunpklo z5.s, z1.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uunpkhi z3.s, z1.h
-; CHECK-NEXT: uunpkhi z5.s, z0.h
+; CHECK-NEXT: uunpklo z3.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT: movprfx z2, z3
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: splice z0.b, p0, z0.b, z2.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = udiv <16 x i8> %op1, %op2
@@ -379,48 +468,76 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: ldp q0, q2, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q1, [x0]
-; CHECK-NEXT: uunpkhi z4.h, z0.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
-; CHECK-NEXT: uunpkhi z6.s, z4.h
-; CHECK-NEXT: uunpklo z4.s, z4.h
-; CHECK-NEXT: uunpkhi z16.s, z0.h
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpkhi z5.h, z1.b
-; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: uunpkhi z7.s, z5.h
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: ptrue p2.b, vl8
+; CHECK-NEXT: ldp q1, q3, [x1]
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT: uunpklo z5.h, z5.b
+; CHECK-NEXT: uunpklo z7.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: uunpklo z5.s, z5.h
+; CHECK-NEXT: mov z4.d, z3.d
+; CHECK-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: uunpklo z4.h, z4.b
+; CHECK-NEXT: uunpklo z6.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uunpkhi z5.s, z1.h
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uzp1 z4.h, z4.h, z6.h
-; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uunpkhi z1.h, z3.b
-; CHECK-NEXT: uunpkhi z6.h, z2.b
-; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z16.s
-; CHECK-NEXT: uunpkhi z7.s, z1.h
-; CHECK-NEXT: uunpkhi z16.s, z6.h
-; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uunpklo z6.s, z6.h
-; CHECK-NEXT: uunpklo z3.h, z3.b
-; CHECK-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s
-; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT: uunpkhi z6.s, z3.h
-; CHECK-NEXT: uunpkhi z16.s, z2.h
+; CHECK-NEXT: uunpklo z7.s, z3.h
+; CHECK-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: splice z6.h, p1, z6.h, z4.h
; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z16.s
+; CHECK-NEXT: uzp1 z4.b, z6.b, z6.b
+; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z7.s
; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uzp1 z1.h, z1.h, z7.h
-; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT: uzp1 z1.b, z2.b, z1.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z4.b
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: uunpklo z3.h, z1.b
+; CHECK-NEXT: uunpklo z6.h, z0.b
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uunpklo z1.h, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpklo z7.s, z0.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z3.h
+; CHECK-NEXT: uunpklo z7.s, z6.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: uunpklo z6.s, z6.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z7.s
+; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z6.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b
+; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b
+; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
@@ -465,14 +582,21 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z2.s, z1.h
-; CHECK-NEXT: uunpkhi z3.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = udiv <8 x i16> %op1, %op2
@@ -482,24 +606,34 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: udiv_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z6.s, z0.h
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: uunpklo z6.s, z3.h
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: uunpklo z4.s, z0.h
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: uunpkhi z4.s, z1.h
+; CHECK-NEXT: uunpklo z7.s, z1.h
+; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
; CHECK-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEXT: uunpkhi z5.s, z2.h
+; CHECK-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uunpkhi z5.s, z3.h
-; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z3.s
-; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT: uzp1 z0.h, z0.h, z5.h
-; CHECK-NEXT: uzp1 z1.h, z1.h, z4.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: uunpklo z2.s, z3.h
+; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: movprfx z2, z7
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h
+; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h
+; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index e8fe9c33b8fd..f0f74e277ca4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -36,16 +36,21 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: sunpklo z2.h, z1.b
; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEXT: sunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z4.s, z2.h
-; CHECK-NEXT: sunpkhi z5.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -58,27 +63,44 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: sunpkhi z2.h, z1.b
-; CHECK-NEXT: sunpkhi z3.h, z0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEXT: sunpklo z3.h, z3.b
+; CHECK-NEXT: sunpklo z5.s, z2.h
+; CHECK-NEXT: sunpklo z6.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z5.s, z2.h
-; CHECK-NEXT: sunpkhi z6.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: sunpklo z3.h, z0.b
; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT: sunpkhi z6.s, z4.h
-; CHECK-NEXT: sunpkhi z7.s, z3.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: sunpklo z4.h, z1.b
+; CHECK-NEXT: sunpklo z6.h, z0.b
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: sunpklo z3.s, z4.h
+; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT: sunpklo z2.s, z6.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: sunpklo z4.s, z4.h
-; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT: sunpklo z6.s, z6.h
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: movprfx z3, z6
; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: splice z2.b, p0, z2.b, z4.b
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
@@ -89,51 +111,81 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: srem_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q2, q0, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q3, q1, [x1]
-; CHECK-NEXT: sunpkhi z5.h, z0.b
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: ldp q3, q2, [x1]
+; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: sunpklo z7.h, z0.b
-; CHECK-NEXT: sunpkhi z17.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT: sunpklo z5.h, z5.b
+; CHECK-NEXT: sunpklo z18.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: sunpklo z5.s, z5.h
-; CHECK-NEXT: sunpkhi z4.h, z1.b
-; CHECK-NEXT: sunpklo z6.h, z1.b
-; CHECK-NEXT: sunpkhi z16.s, z4.h
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: sunpklo z6.h, z2.b
+; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT: sunpklo z16.s, z6.h
+; CHECK-NEXT: sunpklo z4.h, z4.b
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: sunpklo z17.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: sunpklo z4.s, z4.h
-; CHECK-NEXT: sunpkhi z18.s, z6.h
+; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: sunpkhi z5.s, z7.h
+; CHECK-NEXT: sunpklo z18.s, z7.h
+; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT: sunpklo z5.s, z6.h
+; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h
+; CHECK-NEXT: sunpklo z4.s, z7.h
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z7.d, z1.d
+; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT: sunpklo z6.h, z6.b
+; CHECK-NEXT: sunpklo z7.h, z7.b
+; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT: sunpklo z16.s, z6.h
+; CHECK-NEXT: sunpklo z18.s, z7.h
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: sunpklo z6.s, z6.h
; CHECK-NEXT: sunpklo z7.s, z7.h
-; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z18.s
+; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h
-; CHECK-NEXT: sunpkhi z6.h, z3.b
-; CHECK-NEXT: sunpkhi z7.h, z2.b
-; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h
-; CHECK-NEXT: sunpkhi z16.s, z6.h
-; CHECK-NEXT: sunpkhi z17.s, z7.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT: sunpklo z4.h, z3.b
+; CHECK-NEXT: sunpklo z6.h, z1.b
+; CHECK-NEXT: sunpklo z16.s, z4.h
+; CHECK-NEXT: sunpklo z18.s, z6.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: sunpklo z4.s, z4.h
; CHECK-NEXT: sunpklo z6.s, z6.h
-; CHECK-NEXT: sunpklo z7.s, z7.h
-; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: sunpklo z7.h, z3.b
-; CHECK-NEXT: sunpklo z17.h, z2.b
-; CHECK-NEXT: sunpkhi z18.s, z7.h
-; CHECK-NEXT: sunpkhi z19.s, z17.h
-; CHECK-NEXT: sunpklo z7.s, z7.h
-; CHECK-NEXT: sunpklo z17.s, z17.h
-; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z19.s
-; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z17.s
-; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h
-; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: uzp1 z6.b, z7.b, z6.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b
-; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: splice z16.h, p1, z16.h, z4.h
+; CHECK-NEXT: uzp1 z6.b, z17.b, z17.b
+; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b
+; CHECK-NEXT: ptrue p1.b, vl16
+; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b
+; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b
+; CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b
+; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
@@ -165,17 +217,23 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sunpklo z3.s, z3.h
; CHECK-NEXT: sunpklo z4.s, z1.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: sunpklo z5.s, z0.h
-; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: sunpklo z3.s, z0.h
; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = srem <8 x i16> %op1, %op2
@@ -187,24 +245,38 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q2, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: sunpkhi z17.s, z2.h
+; CHECK-NEXT: ptrue p1.h, vl8
+; CHECK-NEXT: mov z17.d, z2.d
+; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8
; CHECK-NEXT: ldp q3, q1, [x1]
-; CHECK-NEXT: sunpkhi z5.s, z0.h
+; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: sunpklo z7.s, z0.h
-; CHECK-NEXT: sunpkhi z16.s, z3.h
-; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: sunpkhi z4.s, z1.h
+; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT: sunpklo z5.s, z5.h
+; CHECK-NEXT: mov z16.d, z3.d
+; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8
+; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: sunpklo z6.s, z1.h
-; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: sunpklo z5.s, z3.h
+; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8
; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: sunpklo z7.s, z2.h
-; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h
-; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h
-; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h
-; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h
+; CHECK-NEXT: sunpklo z4.s, z4.h
+; CHECK-NEXT: sunpklo z7.s, z16.h
+; CHECK-NEXT: sunpklo z16.s, z17.h
+; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: movprfx z5, z16
+; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT: sunpklo z7.s, z3.h
+; CHECK-NEXT: sunpklo z16.s, z2.h
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h
+; CHECK-NEXT: mls z0.h, p1/m, z5.h, z1.h
; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
@@ -347,16 +419,21 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: uunpklo z2.h, z1.b
; CHECK-NEXT: uunpklo z3.h, z0.b
+; CHECK-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEXT: uunpklo z5.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z4.s, z2.h
-; CHECK-NEXT: uunpkhi z5.s, z3.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h
; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: uzp1 z2.h, z2.h, z4.h
-; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
@@ -369,27 +446,44 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: uunpkhi z2.h, z1.b
-; CHECK-NEXT: uunpkhi z3.h, z0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEXT: uunpklo z6.s, z3.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z5.s, z2.h
-; CHECK-NEXT: uunpkhi z6.s, z3.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: uunpklo z4.h, z1.b
-; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uunpklo z3.h, z0.b
; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT: uunpkhi z6.s, z4.h
-; CHECK-NEXT: uunpkhi z7.s, z3.h
+; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: uunpklo z4.h, z1.b
+; CHECK-NEXT: uunpklo z6.h, z0.b
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uunpklo z3.s, z4.h
+; CHECK-NEXT: splice z5.h, p1, z5.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z6.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: uunpklo z4.s, z4.h
-; CHECK-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT: uunpklo z6.s, z6.h
+; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: movprfx z3, z6
; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: uzp1 z4.b, z5.b, z5.b
+; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT: splice z2.b, p0, z2.b, z4.b
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
@@ -400,51 +494,81 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: urem_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q2, q0, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q3, q1, [x1]
-; CHECK-NEXT: uunpkhi z5.h, z0.b
+; CHECK-NEXT: ptrue p1.h, vl4
+; CHECK-NEXT: ldp q3, q2, [x1]
+; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: uunpklo z7.h, z0.b
-; CHECK-NEXT: uunpkhi z17.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT: uunpklo z5.h, z5.b
+; CHECK-NEXT: uunpklo z18.s, z5.h
+; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: uunpklo z5.s, z5.h
-; CHECK-NEXT: uunpkhi z4.h, z1.b
-; CHECK-NEXT: uunpklo z6.h, z1.b
-; CHECK-NEXT: uunpkhi z16.s, z4.h
+; CHECK-NEXT: mov z4.d, z2.d
+; CHECK-NEXT: uunpklo z6.h, z2.b
+; CHECK-NEXT: ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT: uunpklo z16.s, z6.h
+; CHECK-NEXT: uunpklo z4.h, z4.b
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: uunpklo z17.s, z4.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: uunpklo z4.s, z4.h
-; CHECK-NEXT: uunpkhi z18.s, z6.h
+; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uunpkhi z5.s, z7.h
+; CHECK-NEXT: uunpklo z18.s, z7.h
+; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT: uunpklo z5.s, z6.h
+; CHECK-NEXT: splice z17.h, p1, z17.h, z4.h
+; CHECK-NEXT: uunpklo z4.s, z7.h
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z7.d, z1.d
+; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT: uunpklo z6.h, z6.b
+; CHECK-NEXT: uunpklo z7.h, z7.b
+; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT: uunpklo z16.s, z6.h
+; CHECK-NEXT: uunpklo z18.s, z7.h
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
; CHECK-NEXT: uunpklo z6.s, z6.h
; CHECK-NEXT: uunpklo z7.s, z7.h
-; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z18.s
+; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s
; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: uzp1 z5.h, z6.h, z5.h
-; CHECK-NEXT: uunpkhi z6.h, z3.b
-; CHECK-NEXT: uunpkhi z7.h, z2.b
-; CHECK-NEXT: uzp1 z4.h, z4.h, z16.h
-; CHECK-NEXT: uunpkhi z16.s, z6.h
-; CHECK-NEXT: uunpkhi z17.s, z7.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT: uunpklo z4.h, z3.b
+; CHECK-NEXT: uunpklo z6.h, z1.b
+; CHECK-NEXT: uunpklo z16.s, z4.h
+; CHECK-NEXT: uunpklo z18.s, z6.h
+; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT: uunpklo z4.s, z4.h
; CHECK-NEXT: uunpklo z6.s, z6.h
-; CHECK-NEXT: uunpklo z7.s, z7.h
-; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: uunpklo z7.h, z3.b
-; CHECK-NEXT: uunpklo z17.h, z2.b
-; CHECK-NEXT: uunpkhi z18.s, z7.h
-; CHECK-NEXT: uunpkhi z19.s, z17.h
-; CHECK-NEXT: uunpklo z7.s, z7.h
-; CHECK-NEXT: uunpklo z17.s, z17.h
-; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z19.s
-; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z17.s
-; CHECK-NEXT: uzp1 z6.h, z6.h, z16.h
-; CHECK-NEXT: uzp1 z7.h, z7.h, z18.h
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: uzp1 z6.b, z7.b, z6.b
-; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b
-; CHECK-NEXT: mls z2.b, p0/m, z6.b, z3.b
-; CHECK-NEXT: mls z0.b, p0/m, z4.b, z1.b
-; CHECK-NEXT: stp q2, q0, [x0]
+; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: splice z16.h, p1, z16.h, z4.h
+; CHECK-NEXT: uzp1 z6.b, z17.b, z17.b
+; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b
+; CHECK-NEXT: ptrue p1.b, vl16
+; CHECK-NEXT: splice z7.b, p0, z7.b, z4.b
+; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b
+; CHECK-NEXT: mls z1.b, p1/m, z7.b, z3.b
+; CHECK-NEXT: mls z0.b, p1/m, z5.b, z2.b
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b
@@ -476,17 +600,23 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z2.s, z1.h
-; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
; CHECK-NEXT: uunpklo z4.s, z1.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uunpklo z5.s, z0.h
-; CHECK-NEXT: movprfx z3, z5
+; CHECK-NEXT: uunpklo z3.s, z0.h
; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = urem <8 x i16> %op1, %op2
@@ -498,24 +628,38 @@ define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q2, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: uunpkhi z17.s, z2.h
+; CHECK-NEXT: ptrue p1.h, vl8
+; CHECK-NEXT: mov z17.d, z2.d
+; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8
; CHECK-NEXT: ldp q3, q1, [x1]
-; CHECK-NEXT: uunpkhi z5.s, z0.h
+; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: uunpklo z7.s, z0.h
-; CHECK-NEXT: uunpkhi z16.s, z3.h
-; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z17.s
-; CHECK-NEXT: uunpkhi z4.s, z1.h
+; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT: uunpklo z5.s, z5.h
+; CHECK-NEXT: mov z16.d, z3.d
+; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8
+; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: uunpklo z6.s, z1.h
-; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uunpklo z5.s, z3.h
+; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8
; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: uunpklo z7.s, z2.h
-; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: uzp1 z5.h, z5.h, z16.h
-; CHECK-NEXT: uzp1 z4.h, z6.h, z4.h
-; CHECK-NEXT: mls z2.h, p0/m, z5.h, z3.h
-; CHECK-NEXT: mls z0.h, p0/m, z4.h, z1.h
+; CHECK-NEXT: uunpklo z4.s, z4.h
+; CHECK-NEXT: uunpklo z7.s, z16.h
+; CHECK-NEXT: uunpklo z16.s, z17.h
+; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT: movprfx z5, z16
+; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT: uunpklo z7.s, z3.h
+; CHECK-NEXT: uunpklo z16.s, z2.h
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h
+; CHECK-NEXT: mls z0.h, p1/m, z5.h, z1.h
; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, <16 x i16>* %a
More information about the llvm-commits
mailing list