[llvm] 37fa99e - [SchedModels][CortexA55] Add ASIMD integer instructions
Pavel Kosov via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 17 02:43:35 PST 2022
Author: Pavel Kosov
Date: 2022-02-17T13:41:57+03:00
New Revision: 37fa99eda0f5e6d5b15f6bb726d3bcbeeed30c50
URL: https://github.com/llvm/llvm-project/commit/37fa99eda0f5e6d5b15f6bb726d3bcbeeed30c50
DIFF: https://github.com/llvm/llvm-project/commit/37fa99eda0f5e6d5b15f6bb726d3bcbeeed30c50.diff
LOG: [SchedModels][CortexA55] Add ASIMD integer instructions
Depends on D114642
Original review https://reviews.llvm.org/D112201
OS Laboratory. Huawei Russian Research Institute. Saint-Petersburg
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D117003
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SchedA55.td
llvm/test/Analysis/CostModel/AArch64/vector-select.ll
llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
llvm/test/CodeGen/AArch64/active_lane_mask.ll
llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
llvm/test/CodeGen/AArch64/arm64-vhadd.ll
llvm/test/CodeGen/AArch64/cmp-select-sign.ll
llvm/test/CodeGen/AArch64/dag-numsignbits.ll
llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/AArch64/expand-vector-rot.ll
llvm/test/CodeGen/AArch64/f16-instructions.ll
llvm/test/CodeGen/AArch64/fcopysign.ll
llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
llvm/test/CodeGen/AArch64/minmax.ll
llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
llvm/test/CodeGen/AArch64/sat-add.ll
llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
llvm/test/CodeGen/AArch64/signbit-shift.ll
llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
llvm/test/CodeGen/AArch64/sinksplat.ll
llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
llvm/test/CodeGen/AArch64/usub_sat_vec.ll
llvm/test/CodeGen/AArch64/vec_cttz.ll
llvm/test/CodeGen/AArch64/vec_uaddo.ll
llvm/test/CodeGen/AArch64/vec_umulo.ll
llvm/test/CodeGen/AArch64/vecreduce-add.ll
llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
llvm/test/CodeGen/AArch64/vector-fcopysign.ll
llvm/test/CodeGen/AArch64/vselect-constants.ll
llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 009219ce3c54..3543ff3ddfc3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -149,8 +149,36 @@ def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; }
def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; }
def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; }
-def : WriteRes<WriteVd, [CortexA55UnitFPALU]> { let Latency = 4; }
-def : WriteRes<WriteVq, [CortexA55UnitFPALU,CortexA55UnitFPALU]> { let Latency = 4; let BeginGroup = 1; }
+
+// NEON
+class CortexA55WriteVd<int n, ProcResourceKind res> : SchedWriteRes<[res]> {
+ let Latency = n;
+}
+class CortexA55WriteVq<int n, ProcResourceKind res> : SchedWriteRes<[res, res]> {
+ let Latency = n;
+ let BeginGroup = 1;
+}
+class CortexA55WriteVqL<int n, ProcResourceKind res> : SchedWriteRes<[res, res, res, res]> {
+ let Latency = n;
+ let BeginGroup = 1;
+}
+def CortexA55WriteDotScVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteDotVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaLVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaIxVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteMlaVd_4 : CortexA55WriteVd<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_4 : CortexA55WriteVq<4, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_3 : CortexA55WriteVd<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_3 : CortexA55WriteVq<3, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_2 : CortexA55WriteVd<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_2 : CortexA55WriteVq<2, CortexA55UnitFPALU>;
+def CortexA55WriteAluVd_1 : CortexA55WriteVd<1, CortexA55UnitFPALU>;
+def CortexA55WriteAluVq_1 : CortexA55WriteVq<1, CortexA55UnitFPALU>;
+def CortexA55WriteAluVqL_4 : CortexA55WriteVqL<4, CortexA55UnitFPALU>;
+def : SchedAlias<WriteVd, CortexA55WriteVd<4, CortexA55UnitFPALU>>;
+def : SchedAlias<WriteVq, CortexA55WriteVq<4, CortexA55UnitFPALU>>;
// FP ALU specific new schedwrite definitions
def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;}
@@ -229,6 +257,13 @@ def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
WriteID32,WriteID64,
WriteIM32,WriteIM64]>;
+// NEON ALU/MAC forwarding paths
+def CortexA55ReadMla : SchedReadAdvance<3, [CortexA55WriteMlaVd_4, CortexA55WriteMlaVq_4]>;
+def CortexA55ReadMlaIx : SchedReadAdvance<3, [CortexA55WriteMlaIxVq_4]>;
+def CortexA55ReadMlaL : SchedReadAdvance<3, [CortexA55WriteMlaLVq_4]>;
+def CortexA55ReadDot : SchedReadAdvance<3, [CortexA55WriteDotVd_4, CortexA55WriteDotVq_4]>;
+def CortexA55ReadDotSc : SchedReadAdvance<3, [CortexA55WriteDotScVq_4]>;
+
//===----------------------------------------------------------------------===//
// Subtarget-specific InstRWs.
@@ -358,4 +393,99 @@ def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>;
def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+// 4.15. Advanced SIMD integer instructions
+// ASIMD absolute diff
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]ABDv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDv(16i8|4i32|8i16)")>;
+// ASIMD absolute diff accum
+def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]ABAL?v")>;
+// ASIMD absolute diff long
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]ABDLv")>;
+// ASIMD arith #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)",
+ "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)",
+ "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>;
+// ASIMD arith #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "ABSv(1i64|2i32|4i16|8i8)$",
+ "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$",
+ "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$",
+ "ADDPv(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ABSv(2i64|4i32|8i16|16i8)$",
+ "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$",
+ "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$",
+ "ADDPv(16i8|2i64|4i32|8i16)$")>;
+// ASIMD arith #3
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "SADDLv", "UADDLv", "SADDWv",
+ "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>;
+// ASIMD arith #5
+def : InstRW<[CortexA55WriteAluVqL_4], (instregex "RADDHNv", "RSUBHNv")>;
+// ASIMD arith, reduce
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "ADDVv", "SADDLVv", "UADDLVv")>;
+// ASIMD compare #1
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>;
+// ASIMD compare #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "CMTSTv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "CMTSTv(2i64|4i32|8i16|16i8)")>;
+// ASIMD logical $1
+def : InstRW<[CortexA55WriteAluVd_1], (instregex "(AND|EOR|NOT|ORN)v8i8",
+ "(ORR|BIC)v(2i32|4i16|8i8)$", "MVNIv(2i|2s|4i16)")>;
+def : InstRW<[CortexA55WriteAluVq_1], (instregex "(AND|EOR|NOT|ORN)v16i8",
+ "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>;
+// ASIMD max/min, basic
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>;
+// SIMD max/min, reduce
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU](MAX|MIN)Vv")>;
+// ASIMD multiply, by element
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "MULv(2i32|4i16|4i32|8i16)_indexed$",
+ "SQR?DMULHv(1i16|1i32|2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply
+def : InstRW<[CortexA55WriteAluVd_3], (instrs PMULv8i8)>;
+def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULv16i8)>;
+// ASIMD multiply accumulate
+def : InstRW<[CortexA55WriteMlaVd_4, CortexA55ReadMla], (instregex "ML[AS]v(2i32|4i16|8i8)$")>;
+def : InstRW<[CortexA55WriteMlaVq_4, CortexA55ReadMla], (instregex "ML[AS]v(16i8|4i32|8i16)$")>;
+def : InstRW<[CortexA55WriteMlaIxVq_4, CortexA55ReadMlaIx], (instregex "ML[AS]v(2i32|4i16|4i32|8i16)_indexed$")>;
+// ASIMD multiply accumulate half
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQRDML[AS]H[vi]")>;
+// ASIMD multiply accumulate long
+def : InstRW<[CortexA55WriteMlaLVq_4, CortexA55ReadMlaL], (instregex "[SU]ML[AS]Lv")>;
+// ASIMD multiply accumulate long #2
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "SQDML[AS]L[iv]")>;
+// ASIMD dot product
+def : InstRW<[CortexA55WriteDotVd_4, CortexA55ReadDot], (instregex "[SU]DOTv8i8")>;
+def : InstRW<[CortexA55WriteDotVq_4, CortexA55ReadDot], (instregex "[SU]DOTv16i8")>;
+// ASIMD dot product, by scalar
+def : InstRW<[CortexA55WriteDotScVq_4, CortexA55ReadDotSc], (instregex "[SU]DOTlanev")>;
+// ASIMD multiply long
+def : InstRW<[CortexA55WriteAluVq_4], (instregex "[SU]MULLv", "SQDMULL[iv]")>;
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[CortexA55WriteAluVq_3], (instrs PMULLv8i8, PMULLv16i8)>;
+// ASIMD pairwise add and accumulate
+def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]ADALPv")>;
+// ASIMD shift accumulate
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>;
+// ASIMD shift accumulate #2
+def : InstRW<[CortexA55WriteAluVqL_4], (instregex "[SU]RSRA[vd]")>;
+// ASIMD shift by immed
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "SHLd$", "SHLv",
+ "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>;
+// ASIMD shift by immed
+// SXTL and UXTL are aliases for SHLL
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[US]?SHLLv")>;
+// ASIMD shift by immed #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHR(d|v2i32|v4i16|v8i8)",
+ "RSHRNv(2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHRv(16i8|2i64|4i32|8i16)",
+ "RSHRNv(16i8|4i32|8i16)")>;
+// ASIMD shift by register
+def : InstRW<[CortexA55WriteAluVd_2], (instregex "[SU]SHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_2], (instregex "[SU]SHLv(2i64|4i32|8i16|16i8)")>;
+// ASIMD shift by register #2
+def : InstRW<[CortexA55WriteAluVd_3], (instregex "[SU]RSHLv(1i64|2i32|4i16|8i8)")>;
+def : InstRW<[CortexA55WriteAluVq_3], (instregex "[SU]RSHLv(2i64|4i32|8i16|16i8)")>;
+
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index e2d718c62d88..d43b82a8ea13 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -121,11 +121,11 @@ define <2 x i64> @v2i64_select_sle(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
; CODE-LABEL: v3i64_select_sle
; CODE: bb.0
; CODE: mov
-; CODE: ldr
; CODE: mov
; CODE: mov
; CODE: cmge
; CODE: cmge
+; CODE: ldr
; CODE: bif
; CODE: bif
; CODE: ext
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index 80222e5d5b63..da06d82f2430 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -35,11 +35,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI1_0
+; SDAG-NEXT: adrp x9, .LCPI1_1
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; SDAG-NEXT: adrp x8, .LCPI1_1
-; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; SDAG-NEXT: adrp x8, .LCPI1_2
+; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI1_1]
+; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h
; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
@@ -48,41 +48,41 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
-; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI1_4
-; GISEL-NEXT: adrp x10, .LCPI1_0
-; GISEL-NEXT: adrp x9, .LCPI1_1
+; GISEL-NEXT: adrp x9, .LCPI1_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_4]
; GISEL-NEXT: adrp x8, .LCPI1_3
-; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI1_0]
-; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI1_1]
+; GISEL-NEXT: ldr q5, [x9, :lo12:.LCPI1_0]
; GISEL-NEXT: neg v1.8h, v1.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
; GISEL-NEXT: adrp x8, .LCPI1_2
; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
; GISEL-NEXT: adrp x8, .LCPI1_5
-; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
+; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v4.4s, v3.8h, v2.8h
+; GISEL-NEXT: umull v2.4s, v3.4h, v2.4h
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_5]
+; GISEL-NEXT: adrp x8, .LCPI1_1
; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: neg v4.8h, v6.8h
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
+; GISEL-NEXT: shl v3.8h, v3.8h, #15
; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
-; GISEL-NEXT: shl v2.8h, v3.8h, #15
-; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
-; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: neg v2.8h, v4.8h
+; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: sshr v2.8h, v3.8h, #15
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
@@ -93,15 +93,15 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform2:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI2_0
+; SDAG-NEXT: adrp x9, .LCPI2_1
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; SDAG-NEXT: adrp x8, .LCPI2_1
-; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
; SDAG-NEXT: adrp x8, .LCPI2_2
-; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
@@ -112,21 +112,21 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; GISEL-NEXT: adrp x10, .LCPI2_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_3]
; GISEL-NEXT: adrp x8, .LCPI2_2
-; GISEL-NEXT: ldr q3, [x9, :lo12:.LCPI2_4]
; GISEL-NEXT: ldr q4, [x10, :lo12:.LCPI2_0]
; GISEL-NEXT: neg v1.8h, v1.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_2]
; GISEL-NEXT: adrp x8, .LCPI2_1
-; GISEL-NEXT: cmeq v3.8h, v3.8h, v4.8h
; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
-; GISEL-NEXT: shl v3.8h, v3.8h, #15
-; GISEL-NEXT: umull2 v5.4s, v1.8h, v2.8h
+; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
+; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI2_1]
; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
-; GISEL-NEXT: neg v2.8h, v2.8h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v5.8h
-; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sshr v2.8h, v3.8h, #15
+; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI2_4]
+; GISEL-NEXT: cmeq v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
+; GISEL-NEXT: neg v3.8h, v5.8h
+; GISEL-NEXT: shl v2.8h, v2.8h, #15
+; GISEL-NEXT: ushl v1.8h, v1.8h, v3.8h
+; GISEL-NEXT: sshr v2.8h, v2.8h, #15
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -151,21 +151,21 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; GISEL-LABEL: combine_vec_udiv_nonuniform3:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI3_2
-; GISEL-NEXT: adrp x10, .LCPI3_0
-; GISEL-NEXT: adrp x9, .LCPI3_1
+; GISEL-NEXT: adrp x9, .LCPI3_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_2]
; GISEL-NEXT: adrp x8, .LCPI3_3
-; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI3_0]
-; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI3_1]
+; GISEL-NEXT: ldr q3, [x9, :lo12:.LCPI3_0]
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
+; GISEL-NEXT: adrp x8, .LCPI3_1
; GISEL-NEXT: cmeq v2.8h, v2.8h, v3.8h
-; GISEL-NEXT: sub v5.8h, v0.8h, v1.8h
-; GISEL-NEXT: neg v3.8h, v4.8h
+; GISEL-NEXT: sub v4.8h, v0.8h, v1.8h
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
; GISEL-NEXT: shl v2.8h, v2.8h, #15
-; GISEL-NEXT: usra v1.8h, v5.8h, #1
+; GISEL-NEXT: usra v1.8h, v4.8h, #1
+; GISEL-NEXT: neg v3.8h, v3.8h
; GISEL-NEXT: sshr v2.8h, v2.8h, #15
; GISEL-NEXT: ushl v1.8h, v1.8h, v3.8h
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
@@ -178,41 +178,41 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform4:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI4_0
-; SDAG-NEXT: adrp x9, .LCPI4_3
+; SDAG-NEXT: adrp x9, .LCPI4_2
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; SDAG-NEXT: adrp x8, .LCPI4_1
-; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_3]
+; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_2]
; SDAG-NEXT: umull2 v2.8h, v0.16b, v1.16b
; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b
-; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
-; SDAG-NEXT: adrp x8, .LCPI4_2
+; SDAG-NEXT: adrp x8, .LCPI4_3
; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_2]
-; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_3]
+; SDAG-NEXT: and v1.16b, v1.16b, v3.16b
+; SDAG-NEXT: and v0.16b, v0.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform4:
; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, .LCPI4_2
+; GISEL-NEXT: adrp x9, .LCPI4_0
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_2]
; GISEL-NEXT: adrp x8, .LCPI4_3
-; GISEL-NEXT: adrp x9, .LCPI4_2
-; GISEL-NEXT: adrp x10, .LCPI4_1
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
-; GISEL-NEXT: adrp x8, .LCPI4_0
-; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI4_2]
-; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI4_1]
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0]
-; GISEL-NEXT: umull2 v5.8h, v0.16b, v2.16b
-; GISEL-NEXT: umull v2.8h, v0.8b, v2.8b
-; GISEL-NEXT: cmeq v1.16b, v1.16b, v4.16b
-; GISEL-NEXT: neg v3.16b, v3.16b
-; GISEL-NEXT: uzp2 v2.16b, v2.16b, v5.16b
-; GISEL-NEXT: shl v1.16b, v1.16b, #7
-; GISEL-NEXT: ushl v2.16b, v2.16b, v3.16b
-; GISEL-NEXT: sshr v1.16b, v1.16b, #7
-; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI4_0]
+; GISEL-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_3]
+; GISEL-NEXT: umull v1.8h, v0.8b, v1.8b
+; GISEL-NEXT: adrp x8, .LCPI4_1
+; GISEL-NEXT: cmeq v3.16b, v3.16b, v4.16b
+; GISEL-NEXT: uzp2 v1.16b, v1.16b, v2.16b
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
+; GISEL-NEXT: shl v3.16b, v3.16b, #7
+; GISEL-NEXT: neg v2.16b, v2.16b
+; GISEL-NEXT: ushl v1.16b, v1.16b, v2.16b
+; GISEL-NEXT: sshr v2.16b, v3.16b, #7
+; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
; GISEL-NEXT: ret
%div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %div
@@ -222,54 +222,54 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SDAG-LABEL: pr38477:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI5_0
-; SDAG-NEXT: adrp x9, .LCPI5_4
+; SDAG-NEXT: adrp x9, .LCPI5_3
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; SDAG-NEXT: adrp x8, .LCPI5_1
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_1]
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: adrp x8, .LCPI5_2
; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v2.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; SDAG-NEXT: umull v2.4s, v2.4h, v3.4h
-; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_4]
-; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; SDAG-NEXT: adrp x8, .LCPI5_2
+; SDAG-NEXT: sub v3.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v4.4s, v3.8h, v2.8h
+; SDAG-NEXT: umull v2.4s, v3.4h, v2.4h
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; SDAG-NEXT: adrp x8, .LCPI5_4
; SDAG-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; SDAG-NEXT: ldr q4, [x9, :lo12:.LCPI5_3]
; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
-; SDAG-NEXT: adrp x8, .LCPI5_3
-; SDAG-NEXT: ushl v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
-; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_4]
+; SDAG-NEXT: ushl v1.8h, v1.8h, v3.8h
+; SDAG-NEXT: and v0.16b, v0.16b, v2.16b
+; SDAG-NEXT: and v1.16b, v1.16b, v4.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: pr38477:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI5_3
-; GISEL-NEXT: adrp x10, .LCPI5_0
-; GISEL-NEXT: adrp x9, .LCPI5_1
+; GISEL-NEXT: adrp x9, .LCPI5_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
; GISEL-NEXT: adrp x8, .LCPI5_2
-; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI5_0]
-; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI5_1]
+; GISEL-NEXT: ldr q5, [x9, :lo12:.LCPI5_0]
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: adrp x8, .LCPI5_4
; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
+; GISEL-NEXT: adrp x8, .LCPI5_4
+; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v4.4s, v3.8h, v2.8h
+; GISEL-NEXT: umull v2.4s, v3.4h, v2.4h
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_4]
+; GISEL-NEXT: adrp x8, .LCPI5_1
; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: neg v4.8h, v6.8h
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
+; GISEL-NEXT: shl v3.8h, v3.8h, #15
; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
-; GISEL-NEXT: shl v2.8h, v3.8h, #15
-; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
-; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: neg v2.8h, v4.8h
+; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: sshr v2.8h, v3.8h, #15
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index f7247be3c0bf..bc31d41a55f4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -98,10 +98,10 @@ entry:
define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) {
; CHECK-LABEL: dupsext_v2i8_v2i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sshr v0.2s, v0.2s, #24
+; CHECK-NEXT: dup v1.2s, w8
; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 7b3bcacc4f3d..11bcb783cb5c 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -428,10 +428,10 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: dup v0.8b, w0
+; CHECK-NEXT: dup v2.8b, w1
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: dup v1.8b, w1
-; CHECK-NEXT: cmhi v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: cmhi v0.8b, v2.8b, v0.8b
; CHECK-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC)
ret <8 x i1> %active.lane.mask
@@ -440,16 +440,16 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
; CHECK-LABEL: lane_mask_v4i1_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.4h, w0
; CHECK-NEXT: adrp x8, .LCPI25_0
-; CHECK-NEXT: dup v2.4h, w1
+; CHECK-NEXT: dup v0.4h, w0
+; CHECK-NEXT: movi d2, #0xff00ff00ff00ff
+; CHECK-NEXT: dup v3.4h, w1
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI25_0]
; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-NEXT: bic v3.4h, #255, lsl #8
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: movi d1, #0xff00ff00ff00ff
-; CHECK-NEXT: umin v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: cmhi v0.4h, v2.4h, v0.4h
+; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h
; CHECK-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC)
ret <4 x i1> %active.lane.mask
@@ -458,16 +458,16 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI26_0
; CHECK-NEXT: movi d0, #0x0000ff000000ff
; CHECK-NEXT: dup v1.2s, w0
-; CHECK-NEXT: adrp x8, .LCPI26_0
; CHECK-NEXT: dup v3.2s, w1
-; CHECK-NEXT: and v1.8b, v1.8b, v0.8b
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI26_0]
+; CHECK-NEXT: and v1.8b, v1.8b, v0.8b
; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT: and v2.8b, v3.8b, v0.8b
-; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s
-; CHECK-NEXT: cmhi v0.2s, v2.2s, v0.2s
+; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s
+; CHECK-NEXT: and v0.8b, v3.8b, v0.8b
+; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC)
ret <2 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
index ee7be0f48a7e..81b7991a7f86 100644
--- a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
+++ b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
@@ -213,9 +213,9 @@ define <4 x i32> @vec_add_const_const_sub_extrause(<4 x i32> %arg) {
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
-; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -290,9 +290,9 @@ define <4 x i32> @vec_sub_const_add_const_extrause(<4 x i32> %arg) {
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
-; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 7934e39b2b69..0b1b581d7792 100644
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -68,16 +68,16 @@ define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d2, xzr
; CHECK-NEXT: add d0, d1, d0
-; CHECK-NEXT: fmov d1, xzr
-; CHECK-NEXT: sub d0, d1, d0
+; CHECK-NEXT: sub d0, d2, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
+; GENERIC-NEXT: fmov d2, xzr
; GENERIC-NEXT: add d0, d1, d0
-; GENERIC-NEXT: fmov d1, xzr
-; GENERIC-NEXT: sub d0, d1, d0
+; GENERIC-NEXT: sub d0, d2, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
%vecext1 = extractelement <2 x i64> %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
index 4c90f93b235d..17d937d1f394 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -6,8 +6,8 @@
define float @test1(float %x, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: bif.16b v0, v1, v2
; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0
@@ -55,10 +55,10 @@ define float @test4() nounwind {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; CHECK-NEXT: bl _bar
-; CHECK-NEXT: mvni.4s v1, #128, lsl #24
; CHECK-NEXT: fcvt s0, d0
-; CHECK-NEXT: fmov s2, #0.50000000
-; CHECK-NEXT: bsl.16b v1, v2, v0
+; CHECK-NEXT: fmov s1, #0.50000000
+; CHECK-NEXT: mvni.4s v2, #128, lsl #24
+; CHECK-NEXT: bif.16b v1, v0, v2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index 764a6b307b17..870190807b0d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -374,8 +374,8 @@ define void @testLeftBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest)
; CHECK-LABEL: testLeftBad2x64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #10
-; CHECK-NEXT: movk x8, #1, lsl #48
; CHECK-NEXT: shl.2d v1, v1, #48
+; CHECK-NEXT: movk x8, #1, lsl #48
; CHECK-NEXT: dup.2d v2, x8
; CHECK-NEXT: and.16b v0, v0, v2
; CHECK-NEXT: orr.16b v0, v0, v1
@@ -405,8 +405,8 @@ define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest
; CHECK-LABEL: testRightBad2x64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #10
-; CHECK-NEXT: movk x8, #1, lsl #48
; CHECK-NEXT: ushr.2d v1, v1, #48
+; CHECK-NEXT: movk x8, #1, lsl #48
; CHECK-NEXT: dup.2d v2, x8
; CHECK-NEXT: and.16b v0, v0, v2
; CHECK-NEXT: orr.16b v0, v0, v1
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 38d574213b9d..50dda82c904b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -177,12 +177,12 @@ define <4 x i64> @sext_v4i8_to_v4i64(<4 x i8> %v0) nounwind {
; CHECK-LABEL: sext_v4i8_to_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll2.2d v1, v0, #0
-; CHECK-NEXT: ushll.2d v0, v0, #0
-; CHECK-NEXT: shl.2d v1, v1, #56
+; CHECK-NEXT: ushll.2d v1, v0, #0
+; CHECK-NEXT: ushll2.2d v0, v0, #0
+; CHECK-NEXT: shl.2d v2, v1, #56
; CHECK-NEXT: shl.2d v0, v0, #56
-; CHECK-NEXT: sshr.2d v1, v1, #56
-; CHECK-NEXT: sshr.2d v0, v0, #56
+; CHECK-NEXT: sshr.2d v1, v0, #56
+; CHECK-NEXT: sshr.2d v0, v2, #56
; CHECK-NEXT: ret
%r = sext <4 x i8> %v0 to <4 x i64>
ret <4 x i64> %r
@@ -192,12 +192,12 @@ define <8 x i64> @zext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
; CHECK-LABEL: zext_v8i8_to_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: ushll2.4s v4, v0, #0
-; CHECK-NEXT: ushll2.2d v1, v2, #0
-; CHECK-NEXT: ushll.2d v0, v2, #0
-; CHECK-NEXT: ushll2.2d v3, v4, #0
-; CHECK-NEXT: ushll.2d v2, v4, #0
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v3, v2, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
; CHECK-NEXT: ret
%r = zext <8 x i8> %v0 to <8 x i64>
ret <8 x i64> %r
@@ -207,12 +207,12 @@ define <8 x i64> @sext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
; CHECK-LABEL: sext_v8i8_to_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: sshll.4s v2, v0, #0
-; CHECK-NEXT: sshll2.4s v4, v0, #0
-; CHECK-NEXT: sshll2.2d v1, v2, #0
-; CHECK-NEXT: sshll.2d v0, v2, #0
-; CHECK-NEXT: sshll2.2d v3, v4, #0
-; CHECK-NEXT: sshll.2d v2, v4, #0
+; CHECK-NEXT: sshll2.4s v2, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll2.2d v3, v2, #0
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.2d v2, v2, #0
; CHECK-NEXT: ret
%r = sext <8 x i8> %v0 to <8 x i64>
ret <8 x i64> %r
@@ -496,129 +496,129 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
; CHECK-NEXT: ldr w9, [sp, #64]
; CHECK-NEXT: ldr w10, [sp, #192]
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldr w8, [sp, #72]
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: ldr w9, [sp, #200]
-; CHECK-NEXT: fmov s1, w10
-; CHECK-NEXT: ldr w10, [sp, #328]
+; CHECK-NEXT: ldr w8, [sp, #328]
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldr w9, [sp, #72]
+; CHECK-NEXT: fmov s2, w10
+; CHECK-NEXT: ldr w10, [sp, #80]
+; CHECK-NEXT: mov.b v0[1], w8
+; CHECK-NEXT: ldr w8, [sp, #200]
+; CHECK-NEXT: mov.b v1[1], w9
+; CHECK-NEXT: ldr w9, [sp, #336]
; CHECK-NEXT: mov.b v3[1], w1
-; CHECK-NEXT: ldr w11, [sp, #344]
+; CHECK-NEXT: ldr w11, [sp, #88]
; CHECK-NEXT: mov.b v2[1], w8
-; CHECK-NEXT: ldr w8, [sp, #336]
-; CHECK-NEXT: mov.b v1[1], w9
-; CHECK-NEXT: ldr w9, [sp, #80]
-; CHECK-NEXT: mov.b v0[1], w10
-; CHECK-NEXT: ldr w10, [sp, #208]
+; CHECK-NEXT: ldr w8, [sp, #344]
+; CHECK-NEXT: mov.b v0[2], w9
+; CHECK-NEXT: ldr w9, [sp, #208]
+; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: ldr w10, [sp, #352]
; CHECK-NEXT: mov.b v3[2], w2
-; CHECK-NEXT: ldr w12, [sp, #360]
+; CHECK-NEXT: ldr w12, [sp, #96]
; CHECK-NEXT: mov.b v2[2], w9
-; CHECK-NEXT: ldr w9, [sp, #352]
-; CHECK-NEXT: mov.b v1[2], w10
-; CHECK-NEXT: ldr w10, [sp, #88]
-; CHECK-NEXT: mov.b v0[2], w8
+; CHECK-NEXT: ldr w9, [sp, #360]
+; CHECK-NEXT: mov.b v0[3], w8
; CHECK-NEXT: ldr w8, [sp, #216]
+; CHECK-NEXT: mov.b v1[3], w11
+; CHECK-NEXT: ldr w13, [sp, #104]
; CHECK-NEXT: mov.b v3[3], w3
-; CHECK-NEXT: ldr w13, [sp, #376]
-; CHECK-NEXT: mov.b v2[3], w10
-; CHECK-NEXT: ldr w10, [sp, #368]
-; CHECK-NEXT: mov.b v1[3], w8
-; CHECK-NEXT: ldr w8, [sp, #96]
-; CHECK-NEXT: mov.b v0[3], w11
-; CHECK-NEXT: ldr w11, [sp, #224]
+; CHECK-NEXT: ldr w11, [sp, #368]
+; CHECK-NEXT: mov.b v2[3], w8
+; CHECK-NEXT: ldr w14, [sp, #112]
+; CHECK-NEXT: mov.b v0[4], w10
+; CHECK-NEXT: ldr w10, [sp, #224]
+; CHECK-NEXT: mov.b v1[4], w12
+; CHECK-NEXT: ldr w8, [sp, #376]
; CHECK-NEXT: mov.b v3[4], w4
-; CHECK-NEXT: ldr w14, [sp, #392]
-; CHECK-NEXT: mov.b v2[4], w8
-; CHECK-NEXT: ldr w8, [sp, #384]
-; CHECK-NEXT: mov.b v1[4], w11
-; CHECK-NEXT: ldr w11, [sp, #104]
-; CHECK-NEXT: mov.b v0[4], w9
+; CHECK-NEXT: ldr w15, [sp, #120]
+; CHECK-NEXT: mov.b v2[4], w10
+; CHECK-NEXT: ldr w12, [sp, #384]
+; CHECK-NEXT: mov.b v0[5], w9
; CHECK-NEXT: ldr w9, [sp, #232]
+; CHECK-NEXT: mov.b v1[5], w13
+; CHECK-NEXT: ldr w16, [sp, #128]
; CHECK-NEXT: mov.b v3[5], w5
-; CHECK-NEXT: ldr w15, [sp, #408]
-; CHECK-NEXT: mov.b v2[5], w11
-; CHECK-NEXT: ldr w11, [sp, #400]
-; CHECK-NEXT: mov.b v1[5], w9
-; CHECK-NEXT: ldr w9, [sp, #112]
-; CHECK-NEXT: mov.b v0[5], w12
-; CHECK-NEXT: ldr w12, [sp, #240]
+; CHECK-NEXT: ldr w10, [sp, #392]
+; CHECK-NEXT: mov.b v2[5], w9
+; CHECK-NEXT: ldr w13, [sp, #400]
+; CHECK-NEXT: mov.b v0[6], w11
+; CHECK-NEXT: ldr w11, [sp, #240]
+; CHECK-NEXT: mov.b v1[6], w14
+; CHECK-NEXT: ldr w9, [sp, #408]
; CHECK-NEXT: mov.b v3[6], w6
-; CHECK-NEXT: ldr w16, [sp, #424]
-; CHECK-NEXT: mov.b v2[6], w9
-; CHECK-NEXT: ldr w9, [sp, #416]
-; CHECK-NEXT: mov.b v1[6], w12
-; CHECK-NEXT: ldr w12, [sp, #120]
-; CHECK-NEXT: mov.b v0[6], w10
-; CHECK-NEXT: ldr w10, [sp, #248]
+; CHECK-NEXT: ldr w14, [sp, #416]
+; CHECK-NEXT: mov.b v2[6], w11
+; CHECK-NEXT: ldr w11, [sp, #424]
+; CHECK-NEXT: mov.b v0[7], w8
+; CHECK-NEXT: ldr w8, [sp, #248]
+; CHECK-NEXT: mov.b v1[7], w15
+; CHECK-NEXT: ldr w15, [sp, #432]
; CHECK-NEXT: mov.b v3[7], w7
-; CHECK-NEXT: mov.b v2[7], w12
-; CHECK-NEXT: ldr w12, [sp]
-; CHECK-NEXT: mov.b v1[7], w10
-; CHECK-NEXT: ldr w10, [sp, #128]
-; CHECK-NEXT: mov.b v0[7], w13
-; CHECK-NEXT: ldr w13, [sp, #256]
-; CHECK-NEXT: mov.b v3[8], w12
-; CHECK-NEXT: ldr w12, [sp, #432]
-; CHECK-NEXT: mov.b v2[8], w10
-; CHECK-NEXT: ldr w10, [sp, #8]
-; CHECK-NEXT: mov.b v1[8], w13
-; CHECK-NEXT: ldr w13, [sp, #136]
-; CHECK-NEXT: mov.b v0[8], w8
-; CHECK-NEXT: ldr w8, [sp, #264]
-; CHECK-NEXT: mov.b v3[9], w10
-; CHECK-NEXT: ldr w10, [sp, #440]
-; CHECK-NEXT: mov.b v2[9], w13
-; CHECK-NEXT: ldr w13, [sp, #16]
+; CHECK-NEXT: mov.b v2[7], w8
+; CHECK-NEXT: ldr w8, [sp]
+; CHECK-NEXT: mov.b v0[8], w12
+; CHECK-NEXT: ldr w12, [sp, #256]
+; CHECK-NEXT: mov.b v1[8], w16
+; CHECK-NEXT: ldr w16, [sp, #440]
+; CHECK-NEXT: mov.b v3[8], w8
+; CHECK-NEXT: ldr w8, [sp, #136]
+; CHECK-NEXT: mov.b v2[8], w12
+; CHECK-NEXT: ldr w12, [sp, #8]
+; CHECK-NEXT: mov.b v0[9], w10
+; CHECK-NEXT: ldr w10, [sp, #264]
; CHECK-NEXT: mov.b v1[9], w8
-; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: mov.b v0[9], w14
-; CHECK-NEXT: ldr w14, [sp, #272]
-; CHECK-NEXT: mov.b v3[10], w13
+; CHECK-NEXT: ldr w8, [sp, #272]
+; CHECK-NEXT: mov.b v3[9], w12
+; CHECK-NEXT: ldr w12, [sp, #144]
+; CHECK-NEXT: mov.b v2[9], w10
+; CHECK-NEXT: ldr w10, [sp, #16]
+; CHECK-NEXT: mov.b v0[10], w13
; CHECK-NEXT: ldr w13, [sp, #280]
+; CHECK-NEXT: mov.b v1[10], w12
+; CHECK-NEXT: ldr w12, [sp, #152]
+; CHECK-NEXT: mov.b v3[10], w10
+; CHECK-NEXT: ldr w10, [sp, #160]
; CHECK-NEXT: mov.b v2[10], w8
; CHECK-NEXT: ldr w8, [sp, #24]
-; CHECK-NEXT: mov.b v1[10], w14
-; CHECK-NEXT: ldr w14, [sp, #152]
-; CHECK-NEXT: mov.b v0[10], w11
-; CHECK-NEXT: ldr w11, [sp, #288]
+; CHECK-NEXT: mov.b v0[11], w9
+; CHECK-NEXT: ldr w9, [sp, #288]
+; CHECK-NEXT: mov.b v1[11], w12
+; CHECK-NEXT: ldr w12, [sp, #296]
; CHECK-NEXT: mov.b v3[11], w8
; CHECK-NEXT: ldr w8, [sp, #32]
-; CHECK-NEXT: mov.b v2[11], w14
-; CHECK-NEXT: ldr w14, [sp, #296]
-; CHECK-NEXT: mov.b v1[11], w13
-; CHECK-NEXT: ldr w13, [sp, #160]
-; CHECK-NEXT: mov.b v0[11], w15
+; CHECK-NEXT: mov.b v2[11], w13
+; CHECK-NEXT: mov.b v0[12], w14
+; CHECK-NEXT: mov.b v1[12], w10
+; CHECK-NEXT: ldr w10, [sp, #168]
; CHECK-NEXT: mov.b v3[12], w8
; CHECK-NEXT: ldr w8, [sp, #40]
-; CHECK-NEXT: mov.b v2[12], w13
-; CHECK-NEXT: ldr w13, [sp, #312]
-; CHECK-NEXT: mov.b v1[12], w11
-; CHECK-NEXT: ldr w11, [sp, #168]
-; CHECK-NEXT: mov.b v0[12], w9
+; CHECK-NEXT: mov.b v2[12], w9
; CHECK-NEXT: ldr w9, [sp, #304]
+; CHECK-NEXT: mov.b v0[13], w11
+; CHECK-NEXT: ldr w11, [sp, #312]
+; CHECK-NEXT: mov.b v1[13], w10
+; CHECK-NEXT: ldr w10, [sp, #176]
; CHECK-NEXT: mov.b v3[13], w8
; CHECK-NEXT: ldr w8, [sp, #48]
-; CHECK-NEXT: mov.b v2[13], w11
-; CHECK-NEXT: ldr w11, [sp, #176]
-; CHECK-NEXT: mov.b v1[13], w14
-; CHECK-NEXT: mov.b v0[13], w16
+; CHECK-NEXT: mov.b v2[13], w12
+; CHECK-NEXT: mov.b v0[14], w15
+; CHECK-NEXT: mov.b v1[14], w10
+; CHECK-NEXT: ldr w10, [sp, #184]
; CHECK-NEXT: mov.b v3[14], w8
; CHECK-NEXT: ldr w8, [sp, #56]
-; CHECK-NEXT: mov.b v2[14], w11
-; CHECK-NEXT: mov.b v1[14], w9
-; CHECK-NEXT: ldr w9, [sp, #184]
-; CHECK-NEXT: mov.b v0[14], w12
+; CHECK-NEXT: mov.b v2[14], w9
+; CHECK-NEXT: mov.b v0[15], w16
+; CHECK-NEXT: mov.b v1[15], w10
; CHECK-NEXT: mov.b v3[15], w8
-; CHECK-NEXT: mov.b v2[15], w9
-; CHECK-NEXT: mov.b v1[15], w13
-; CHECK-NEXT: mov.b v0[15], w10
+; CHECK-NEXT: mov.b v2[15], w11
+; CHECK-NEXT: shl.16b v4, v0, #7
+; CHECK-NEXT: shl.16b v1, v1, #7
; CHECK-NEXT: shl.16b v3, v3, #7
; CHECK-NEXT: shl.16b v2, v2, #7
-; CHECK-NEXT: shl.16b v4, v1, #7
-; CHECK-NEXT: shl.16b v5, v0, #7
; CHECK-NEXT: cmlt.16b v0, v3, #0
-; CHECK-NEXT: cmlt.16b v1, v2, #0
-; CHECK-NEXT: cmlt.16b v2, v4, #0
-; CHECK-NEXT: cmlt.16b v3, v5, #0
+; CHECK-NEXT: cmlt.16b v1, v1, #0
+; CHECK-NEXT: cmlt.16b v2, v2, #0
+; CHECK-NEXT: cmlt.16b v3, v4, #0
; CHECK-NEXT: ret
%res = sext <64 x i1> %arg to <64 x i8>
ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index 4f365171de05..396d9efe4566 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -787,10 +787,10 @@ define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
; CHECK-LABEL: hadd32_sext_lsr:
; CHECK: // %bb.0:
-; CHECK-NEXT: saddl2.2d v2, v0, v1
-; CHECK-NEXT: saddl.2d v0, v0, v1
-; CHECK-NEXT: ushr.2d v1, v2, #1
-; CHECK-NEXT: ushr.2d v0, v0, #1
+; CHECK-NEXT: saddl.2d v2, v0, v1
+; CHECK-NEXT: saddl2.2d v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v0, #1
+; CHECK-NEXT: ushr.2d v0, v2, #1
; CHECK-NEXT: ret
%zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
%zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index 44d0eed3d723..abf9469c45ef 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -178,10 +178,10 @@ define <4 x i32> @sign_4xi32_multi_use(<4 x i32> %a) {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
; CHECK-NEXT: orr v2.4s, #1
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: cmgt v1.4s, v0.4s, v1.4s
; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: xtn v0.4h, v1.4s
; CHECK-NEXT: bl use_4xi1
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
@@ -198,10 +198,10 @@ define <4 x i32> @not_sign_4xi32(<4 x i32> %a) {
; CHECK-LABEL: not_sign_4xi32:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %a, <i32 1, i32 -1, i32 -1, i32 -1>
@@ -229,10 +229,10 @@ define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
; CHECK-NEXT: adrp x8, .LCPI18_0
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
%res = select <4 x i1> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 1>
diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
index e4f13f5c98a1..3ac8a1877272 100644
--- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
+++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -8,15 +8,15 @@ define void @signbits_vXi1(<4 x i16> %a1) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w1, wzr
+; CHECK-NEXT: movi v2.4h, #1
; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov w1, wzr
; CHECK-NEXT: mov w2, wzr
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.4h, #1
-; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: cmgt v0.4h, v2.4h, v0.4h
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
index b278e9cd06da..72e9a1e710f1 100644
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
@@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; ALL-NEXT: sdiv x9, x9, x8
; ALL-NEXT: mul x8, x9, x8
; ALL-NEXT: sdiv x11, x11, x10
+; ALL-NEXT: fmov d2, x9
; ALL-NEXT: fmov d1, x8
; ALL-NEXT: mul x10, x11, x10
+; ALL-NEXT: mov v2.d[1], x11
; ALL-NEXT: mov v1.d[1], x10
+; ALL-NEXT: str q2, [x0]
; ALL-NEXT: sub v0.2d, v0.2d, v1.2d
-; ALL-NEXT: fmov d1, x9
-; ALL-NEXT: mov v1.d[1], x11
-; ALL-NEXT: str q1, [x0]
; ALL-NEXT: ret
%div = sdiv <2 x i64> %x, %y
store <2 x i64> %div, <2 x i64>* %divdst, align 16
diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
index af5781956791..c514cc99f014 100644
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
@@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst
; ALL-NEXT: udiv x9, x9, x8
; ALL-NEXT: mul x8, x9, x8
; ALL-NEXT: udiv x11, x11, x10
+; ALL-NEXT: fmov d2, x9
; ALL-NEXT: fmov d1, x8
; ALL-NEXT: mul x10, x11, x10
+; ALL-NEXT: mov v2.d[1], x11
; ALL-NEXT: mov v1.d[1], x10
+; ALL-NEXT: str q2, [x0]
; ALL-NEXT: sub v0.2d, v0.2d, v1.2d
-; ALL-NEXT: fmov d1, x9
-; ALL-NEXT: mov v1.d[1], x11
-; ALL-NEXT: str q1, [x0]
; ALL-NEXT: ret
%div = udiv <2 x i64> %x, %y
store <2 x i64> %div, <2 x i64>* %divdst, align 16
diff --git a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
index b75913dabadf..de9a0fe9b23a 100644
--- a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
+++ b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
@@ -7,14 +7,14 @@ define <2 x i16> @rotlv2_16(<2 x i16> %vec2_16, <2 x i16> %shift) {
; CHECK-LABEL: rotlv2_16:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.2s, #15
-; CHECK-NEXT: neg v3.2s, v1.2s
-; CHECK-NEXT: movi d4, #0x00ffff0000ffff
-; CHECK-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT: movi d3, #0x00ffff0000ffff
+; CHECK-NEXT: neg v4.2s, v1.2s
+; CHECK-NEXT: and v4.8b, v4.8b, v2.8b
+; CHECK-NEXT: and v3.8b, v0.8b, v3.8b
+; CHECK-NEXT: neg v4.2s, v4.2s
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v4.8b, v0.8b, v4.8b
-; CHECK-NEXT: neg v3.2s, v3.2s
; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushl v2.2s, v4.2s, v3.2s
+; CHECK-NEXT: ushl v2.2s, v3.2s, v4.2s
; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-NEXT: ret
%1 = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %vec2_16, <2 x i16> %vec2_16, <2 x i16> %shift)
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 7faa4f8ec6fa..f50f566703a7 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1101,9 +1101,9 @@ define half @test_maxnum(half %a, half %b) #0 {
}
; CHECK-CVT-LABEL: test_copysign:
-; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: bif.16b v0, v1, v2
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: ret
@@ -1119,15 +1119,15 @@ define half @test_copysign(half %a, half %b) #0 {
}
; CHECK-CVT-LABEL: test_copysign_f32:
-; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: bif.16b v0, v1, v2
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: ret
; CHECK-FP16-LABEL: test_copysign_f32:
-; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8
; CHECK-FP16-NEXT: fcvt h1, s1
+; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8
; CHECK-FP16-NEXT: bif.16b v0, v1, v2
; CHECK-FP16-NEXT: ret
@@ -1138,16 +1138,16 @@ define half @test_copysign_f32(half %a, float %b) #0 {
}
; CHECK-CVT-LABEL: test_copysign_f64:
-; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: fcvt s1, d1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: bif.16b v0, v1, v2
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: ret
; CHECK-FP16-LABEL: test_copysign_f64:
-; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8
; CHECK-FP16-NEXT: fcvt h1, d1
+; CHECK-FP16-NEXT: mvni.8h v2, #128, lsl #8
; CHECK-FP16-NEXT: bif.16b v0, v1, v2
; CHECK-FP16-NEXT: ret
@@ -1161,9 +1161,9 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; away the (fpext (fp_round <result>)) here.
; CHECK-CVT-LABEL: test_copysign_extended:
-; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mvni.4s v2, #128, lsl #24
; CHECK-CVT-NEXT: bif.16b v0, v1, v2
; CHECK-CVT-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index b012b6493901..ff93ff77a1a0 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -95,8 +95,8 @@ entry:
define float @copysign32(float %a, float %b) {
; CHECK-LABEL: copysign32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mvni v2.4s, #128, lsl #24
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: mvni v2.4s, #128, lsl #24
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
@@ -142,9 +142,9 @@ entry:
define half @copysign16(half %a, half %b) {
; CHECK-LABEL: copysign16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mvni v2.4s, #128, lsl #24
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: mvni v2.4s, #128, lsl #24
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 3625bd6011fb..ad3f2b3963b5 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -295,11 +295,11 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -364,12 +364,12 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -454,11 +454,11 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: stp q2, q3, [sp, #64] // 32-byte Folded Spill
; CHECK-NEXT: adrp x8, .LCPI17_0
+; CHECK-NEXT: stp q2, q3, [sp, #64] // 32-byte Folded Spill
; CHECK-NEXT: mov v2.16b, v1.16b
-; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
@@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) {
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: fcvtzs v0.2s, v0.2s
+; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s
; CHECK-NEXT: ret
%x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f)
ret <2 x i1> %x
@@ -1628,9 +1628,9 @@ define <4 x i1> @test_signed_v4f16_v4i1(<4 x half> %f) {
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000
; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff
; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h
-; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h
+; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h
; CHECK-FP16-NEXT: ret
%x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f)
ret <4 x i1> %x
@@ -1674,10 +1674,10 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) {
;
; CHECK-FP16-LABEL: test_signed_v4f16_v4i13:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: mvni v1.4h, #240, lsl #8
; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h
-; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: movi v1.4h, #240, lsl #8
+; CHECK-FP16-NEXT: mvni v2.4h, #240, lsl #8
+; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v2.4h
; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
%x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f)
@@ -2129,9 +2129,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000
; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
+; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff
; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h
-; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h
+; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
%x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f)
@@ -2278,10 +2278,10 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
;
; CHECK-FP16-LABEL: test_signed_v8f16_v8i13:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: mvni v1.8h, #240, lsl #8
; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
-; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: movi v1.8h, #240, lsl #8
+; CHECK-FP16-NEXT: mvni v2.8h, #240, lsl #8
+; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v2.8h
; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
%x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f)
@@ -2366,21 +2366,21 @@ define <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) {
; CHECK-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: movi v1.4s, #3, msl #16
-; CHECK-NEXT: mvni v3.4s, #3, msl #16
; CHECK-NEXT: fcvtzs v2.4s, v2.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: mov w1, v1.s[1]
-; CHECK-NEXT: mov w2, v1.s[2]
+; CHECK-NEXT: mvni v1.4s, #3, msl #16
+; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov w1, v2.s[1]
+; CHECK-NEXT: mov w2, v2.s[2]
; CHECK-NEXT: mov w5, v0.s[1]
-; CHECK-NEXT: mov w3, v1.s[3]
+; CHECK-NEXT: mov w3, v2.s[3]
; CHECK-NEXT: mov w6, v0.s[2]
; CHECK-NEXT: mov w7, v0.s[3]
; CHECK-NEXT: fmov w4, s0
-; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
%x = call <8 x i19> @llvm.fptosi.sat.v8f16.v8i19(<8 x half> %f)
ret <8 x i19> %x
@@ -2995,11 +2995,11 @@ define <8 x i8> @test_signed_v8f32_v8i8(<8 x float> %f) {
; CHECK-NEXT: movi v2.4s, #127
; CHECK-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: mvni v3.4s, #127
; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: mvni v2.4s, #127
+; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index ace519684215..cbb8b8a51126 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -285,11 +285,11 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -338,12 +338,12 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -406,13 +406,13 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill
; CHECK-NEXT: adrp x8, .LCPI17_0
+; CHECK-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: str q3, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -1424,8 +1424,8 @@ define <4 x i13> @test_unsigned_v4f16_v4i13(<4 x half> %f) {
;
; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i13:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: mvni v1.4h, #224, lsl #8
; CHECK-FP16-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-FP16-NEXT: mvni v1.4h, #224, lsl #8
; CHECK-FP16-NEXT: umin v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
%x = call <4 x i13> @llvm.fptoui.sat.v4f16.v4i13(<4 x half> %f)
@@ -1910,8 +1910,8 @@ define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
;
; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i13:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: mvni v1.8h, #224, lsl #8
; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h
+; CHECK-FP16-NEXT: mvni v1.8h, #224, lsl #8
; CHECK-FP16-NEXT: umin v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
%x = call <8 x i13> @llvm.fptoui.sat.v8f16.v8i13(<8 x half> %f)
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
index e62f84f74671..bb37cc81a7ab 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -81,8 +81,8 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-NEXT: neg v3.4s, v1.4s
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
@@ -170,8 +170,8 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-NEXT: movi v2.4s, #31
; CHECK-NEXT: neg v3.4s, v1.4s
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index 3445968721c8..63102a3d146e 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -92,17 +92,17 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: cnth x8
-; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: sub x8, x8, #8
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x8, #8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: lsl x8, x8, #1
; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
@@ -136,17 +136,17 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: cntw x8
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sub x8, x8, #4
; CHECK-NEXT: mov w9, #4
; CHECK-NEXT: cmp x8, #4
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: lsl x8, x8, #2
; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
@@ -180,17 +180,17 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(<vscale x 2
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sub x8, x8, #2
; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: cmp x8, #2
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: lsl x8, x8, #3
; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
diff --git a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
index 8f76a1fea511..b72422be759f 100644
--- a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
+++ b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
@@ -22,9 +22,9 @@ define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2)
; CHECK-LABEL: mlai16_and:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
-; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <4 x i16> %vec0 to <4 x i32>
@@ -158,9 +158,9 @@ define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2)
; CHECK-LABEL: mlai32_and:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s
-; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <2 x i32> %vec0 to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
index af6475732a81..3dad36acdf63 100644
--- a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
@@ -1079,8 +1079,8 @@ define <4 x i32> @notted_smin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_smin_bc_ab:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1101,8 +1101,8 @@ define <4 x i32> @notted_smin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_smin_bc_ba:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1167,8 +1167,8 @@ define <4 x i32> @notted_smin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_smin_bc_ab_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1189,8 +1189,8 @@ define <4 x i32> @notted_smin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_smin_bc_ba_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1255,8 +1255,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_smin_bc_ab_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1277,8 +1277,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_smin_bc_ba_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1343,8 +1343,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_smin_bc_ab_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1365,8 +1365,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_smin_bc_ba_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1431,8 +1431,8 @@ define <4 x i32> @notted_smax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_smax_bc_ab:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1453,8 +1453,8 @@ define <4 x i32> @notted_smax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_smax_bc_ba:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1519,8 +1519,8 @@ define <4 x i32> @notted_smax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_smax_bc_ab_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1541,8 +1541,8 @@ define <4 x i32> @notted_smax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_smax_bc_ba_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1607,8 +1607,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_smax_bc_ab_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1629,8 +1629,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_smax_bc_ba_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1695,8 +1695,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_smax_bc_ab_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1717,8 +1717,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_smax_bc_ba_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1783,8 +1783,8 @@ define <4 x i32> @notted_umin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_umin_bc_ab:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1805,8 +1805,8 @@ define <4 x i32> @notted_umin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_umin_bc_ba:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1871,8 +1871,8 @@ define <4 x i32> @notted_umin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_umin_bc_ab_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1893,8 +1893,8 @@ define <4 x i32> @notted_umin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_umin_bc_ba_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1959,8 +1959,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_umin_bc_ab_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1981,8 +1981,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_umin_bc_ba_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2047,8 +2047,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_umin_bc_ab_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2069,8 +2069,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_umin_bc_ba_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2135,8 +2135,8 @@ define <4 x i32> @notted_umax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_umax_bc_ab:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2157,8 +2157,8 @@ define <4 x i32> @notted_umax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; CHECK-LABEL: notted_umax_bc_ba:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2223,8 +2223,8 @@ define <4 x i32> @notted_umax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_umax_bc_ab_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2245,8 +2245,8 @@ define <4 x i32> @notted_umax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
; CHECK-LABEL: notted_umax_bc_ba_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2311,8 +2311,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_umax_bc_ab_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2333,8 +2333,8 @@ define <4 x i32> @notted_umax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
; CHECK-LABEL: notted_umax_bc_ba_eq_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2399,8 +2399,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_umax_bc_ab_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2421,8 +2421,8 @@ define <4 x i32> @notted_umax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
; CHECK-LABEL: notted_umax_bc_ba_eq_swap_pred:
; CHECK: // %bb.0:
; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: mvn v2.16b, v2.16b
; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/minmax.ll b/llvm/test/CodeGen/AArch64/minmax.ll
index 74f6b894eef8..59faf0efc35d 100644
--- a/llvm/test/CodeGen/AArch64/minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax.ll
@@ -122,10 +122,10 @@ define <16 x i32> @t11(<16 x i32> %a, <16 x i32> %b) {
define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: t12:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.16b, #1
-; CHECK-NEXT: cmhi v3.16b, v1.16b, v0.16b
-; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v3.16b, v2.16b
+; CHECK-NEXT: cmhi v2.16b, v1.16b, v0.16b
+; CHECK-NEXT: movi v3.16b, #1
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v2.16b, v3.16b
; CHECK-NEXT: add v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
%t1 = icmp ugt <16 x i8> %b, %a
diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
index 2bcb7ca696d1..8cd45160fcf4 100644
--- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
+++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
@@ -13,9 +13,9 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %d
; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h
; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h
; CHECK-NEXT: add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: str q2, [x9, x8]
; CHECK-NEXT: ldr x9, [x2, #56]
+; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: str q0, [x9, x8]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index 9f9e459b7354..75c0355965eb 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -119,12 +119,12 @@ define dso_local void @run_test() local_unnamed_addr #0 {
; CHECK-NEXT: add v0.2d, v0.2d, v15.2d
; CHECK-NEXT: add v11.2d, v11.2d, v14.2d
; CHECK-NEXT: fmov d14, x3
-; CHECK-NEXT: add v9.2d, v9.2d, v1.2d
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: add v9.2d, v9.2d, v1.2d
; CHECK-NEXT: mov v14.d[1], x15
-; CHECK-NEXT: add v31.2d, v31.2d, v1.2d
; CHECK-NEXT: mov v0.d[1], x12
+; CHECK-NEXT: add v31.2d, v31.2d, v1.2d
; CHECK-NEXT: add v26.2d, v26.2d, v1.2d
; CHECK-NEXT: add v23.2d, v23.2d, v1.2d
; CHECK-NEXT: add v21.2d, v21.2d, v1.2d
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index ecc94ccc2f79..917f14689026 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-LABEL: v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sqadd v0.8b, v1.8b, v0.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <8 x i8>, <8 x i8>* %px
@@ -158,9 +158,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i16>, <4 x i16>* %px
@@ -224,9 +224,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-LABEL: v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr b0, [x0]
-; CHECK-NEXT: ldr b1, [x1]
-; CHECK-NEXT: sqadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr b0, [x1]
+; CHECK-NEXT: ldr b1, [x0]
+; CHECK-NEXT: sqadd v0.8b, v1.8b, v0.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
%x = load <1 x i8>, <1 x i8>* %px
@@ -239,9 +239,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-LABEL: v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ldr h1, [x1]
-; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr h0, [x1]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
%x = load <1 x i16>, <1 x i16>* %px
@@ -254,10 +254,10 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; CHECK-LABEL: v16i4:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
-; CHECK-NEXT: sshr v1.16b, v1.16b, #4
+; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
+; CHECK-NEXT: sshr v1.16b, v1.16b, #4
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index 99ff626d8dd8..06fc023d927d 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) {
; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #213
+; CHECK-NEXT: movi v2.16b, #42
; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: movi v1.16b, #42
-; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: add v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%c = icmp ult <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
%s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
@@ -383,9 +383,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) {
; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min:
; CHECK: // %bb.0:
-; CHECK-NEXT: mvni v1.8h, #42
-; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h
; CHECK-NEXT: movi v1.8h, #42
+; CHECK-NEXT: mvni v2.8h, #42
+; CHECK-NEXT: umin v0.8h, v0.8h, v2.8h
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%c = icmp ult <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
index e473bbe72cef..56082bcb4c1b 100644
--- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
@@ -165,8 +165,8 @@ define i64 @sel_shift_bool_i64(i1 %t) {
define <16 x i8> @sel_shift_bool_v16i8(<16 x i1> %t) {
; CHECK-LABEL: sel_shift_bool_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: movi v1.16b, #128
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -205,8 +205,8 @@ define <2 x i64> @sel_shift_bool_v2i64(<2 x i1> %t) {
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: mov w8, #65536
-; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: shl v0.2d, v0.2d, #63
+; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll
index 7991c797ff27..cb758f8a6202 100644
--- a/llvm/test/CodeGen/AArch64/signbit-shift.ll
+++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll
@@ -30,9 +30,9 @@ define <4 x i32> @add_zext_ifpos_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_zext_ifpos_vec_splat:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: movi v2.4s, #41
; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #41
-; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%e = zext <4 x i1> %c to <4 x i32>
@@ -79,9 +79,9 @@ define <4 x i32> @add_sext_ifpos_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_sext_ifpos_vec_splat:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: movi v2.4s, #42
; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #42
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%c = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%e = sext <4 x i1> %c to <4 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
index d0f8d08a6526..0c1e61ff0640 100644
--- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
+++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
@@ -160,8 +160,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
%r = add <4 x i32> %t0, %b
@@ -172,8 +172,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
%r = add <4 x i32> %b, %t0
@@ -188,8 +188,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
%r = add <4 x i32> %t0, %b
@@ -200,8 +200,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
%r = add <4 x i32> %b, %t0
@@ -216,8 +216,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
%r = add <4 x i32> %t0, %b
@@ -228,8 +228,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
%r = add <4 x i32> %b, %t0
@@ -244,8 +244,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI18_0
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
%r = sub <4 x i32> %t0, %b
@@ -256,8 +256,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI19_0
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
%r = sub <4 x i32> %b, %t0
@@ -272,8 +272,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0]
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
%r = sub <4 x i32> %t0, %b
@@ -284,8 +284,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI21_0
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
%r = sub <4 x i32> %b, %t0
@@ -300,8 +300,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI22_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_0]
+; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
%r = sub <4 x i32> %t0, %b
@@ -312,8 +312,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI23_0
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
%t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
%r = sub <4 x i32> %b, %t0
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index 98b5420b9475..4c8b0ab87e9c 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -305,10 +305,10 @@ define <4 x float> @fma(<4 x float> %x, <4 x float> *%y) {
; CHECK-NEXT: dup v1.4s, v1.s[3]
; CHECK-NEXT: .LBB9_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: ldr q3, [x0]
-; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: subs w8, w8, #1
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: fmla v0.4s, v2.4s, v3.4s
; CHECK-NEXT: b.eq .LBB9_1
; CHECK-NEXT: // %bb.2: // %l2
diff --git a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
index 69cd4ee69733..7d041d4c5e75 100644
--- a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
+++ b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
@@ -5,20 +5,20 @@ define <16 x double> @test_sitofp_fixed(<16 x i32> %in) {
; CHECK-LABEL: test_sitofp_fixed:
; CHECK: ; %bb.0:
; CHECK-NEXT: sshll2.2d v4, v2, #0
-; CHECK-NEXT: sshll.2d v16, v1, #0
; CHECK-NEXT: sshll2.2d v5, v0, #0
; CHECK-NEXT: sshll2.2d v6, v1, #0
; CHECK-NEXT: sshll2.2d v7, v3, #0
; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.2d v16, v1, #0
; CHECK-NEXT: sshll.2d v17, v2, #0
; CHECK-NEXT: sshll.2d v18, v3, #0
; CHECK-NEXT: scvtf.2d v1, v5, #6
+; CHECK-NEXT: scvtf.2d v0, v0, #6
; CHECK-NEXT: scvtf.2d v3, v6, #6
; CHECK-NEXT: scvtf.2d v2, v16, #6
; CHECK-NEXT: scvtf.2d v5, v4, #6
-; CHECK-NEXT: scvtf.2d v0, v0, #6
-; CHECK-NEXT: scvtf.2d v7, v7, #6
; CHECK-NEXT: scvtf.2d v4, v17, #6
+; CHECK-NEXT: scvtf.2d v7, v7, #6
; CHECK-NEXT: scvtf.2d v6, v18, #6
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 43ce12809203..84e0979f6551 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -59,47 +59,47 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; CHECK-LABEL: test_srem_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x11, #7282
-; CHECK-NEXT: sbfx x10, x0, #0, #33
+; CHECK-NEXT: mov x8, #7282
+; CHECK-NEXT: sbfx x9, x0, #0, #33
+; CHECK-NEXT: movk x8, #29127, lsl #16
+; CHECK-NEXT: mov x11, #7281
+; CHECK-NEXT: movk x8, #50972, lsl #32
; CHECK-NEXT: movk x11, #29127, lsl #16
-; CHECK-NEXT: mov x9, #7281
+; CHECK-NEXT: movk x8, #7281, lsl #48
; CHECK-NEXT: movk x11, #50972, lsl #32
-; CHECK-NEXT: movk x9, #29127, lsl #16
+; CHECK-NEXT: sbfx x12, x1, #0, #33
+; CHECK-NEXT: sbfx x10, x2, #0, #33
+; CHECK-NEXT: smulh x13, x9, x8
; CHECK-NEXT: movk x11, #7281, lsl #48
-; CHECK-NEXT: movk x9, #50972, lsl #32
-; CHECK-NEXT: sbfx x13, x1, #0, #33
-; CHECK-NEXT: sbfx x8, x2, #0, #33
-; CHECK-NEXT: smulh x12, x10, x11
-; CHECK-NEXT: movk x9, #7281, lsl #48
-; CHECK-NEXT: smulh x11, x13, x11
-; CHECK-NEXT: smulh x9, x8, x9
-; CHECK-NEXT: add x12, x12, x12, lsr #63
-; CHECK-NEXT: sub x9, x9, x8
-; CHECK-NEXT: add x11, x11, x11, lsr #63
-; CHECK-NEXT: add x12, x12, x12, lsl #3
-; CHECK-NEXT: asr x14, x9, #3
-; CHECK-NEXT: sub x10, x10, x12
-; CHECK-NEXT: add x9, x14, x9, lsr #63
+; CHECK-NEXT: smulh x8, x12, x8
+; CHECK-NEXT: smulh x11, x10, x11
+; CHECK-NEXT: add x13, x13, x13, lsr #63
+; CHECK-NEXT: sub x11, x11, x10
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: add x13, x13, x13, lsl #3
+; CHECK-NEXT: asr x14, x11, #3
+; CHECK-NEXT: sub x9, x9, x13
+; CHECK-NEXT: add x11, x14, x11, lsr #63
+; CHECK-NEXT: add x8, x8, x8, lsl #3
+; CHECK-NEXT: sub x8, x12, x8
; CHECK-NEXT: add x11, x11, x11, lsl #3
-; CHECK-NEXT: sub x11, x13, x11
-; CHECK-NEXT: add x9, x9, x9, lsl #3
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: add x8, x8, x9
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: mov x9, #8589934591
-; CHECK-NEXT: mov v0.d[1], x11
-; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: adrp x11, .LCPI3_0
+; CHECK-NEXT: adrp x12, .LCPI3_1
+; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: fmov d1, x10
; CHECK-NEXT: dup v2.2d, x9
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: adrp x9, .LCPI3_1
+; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI3_0]
+; CHECK-NEXT: ldr q4, [x12, :lo12:.LCPI3_1]
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1]
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v3.2d
+; CHECK-NEXT: cmeq v0.2d, v0.2d, v3.2d
+; CHECK-NEXT: cmeq v1.2d, v1.2d, v4.2d
; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: mov w1, v0.s[1]
; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
index 56f66a127d87..bd6145d1bca6 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -7,6 +7,7 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: adrp x9, .LCPI0_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
@@ -17,11 +18,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI0_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -39,12 +39,12 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #39321
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI1_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI1_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -60,12 +60,12 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #39321
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI2_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI2_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -83,17 +83,17 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -107,17 +107,17 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -131,6 +131,7 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: adrp x9, .LCPI5_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: adrp x8, .LCPI5_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1]
@@ -141,11 +142,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI5_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -157,6 +157,7 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI6_0
; CHECK-NEXT: adrp x9, .LCPI6_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
; CHECK-NEXT: adrp x8, .LCPI6_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1]
@@ -167,11 +168,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI6_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4]
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -187,6 +187,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI7_0
; CHECK-NEXT: adrp x9, .LCPI7_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
; CHECK-NEXT: adrp x8, .LCPI7_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_1]
@@ -197,11 +198,10 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI7_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -215,6 +215,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI8_0
; CHECK-NEXT: adrp x9, .LCPI8_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: adrp x8, .LCPI8_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1]
@@ -225,11 +226,10 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI8_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -243,6 +243,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI9_0
; CHECK-NEXT: adrp x9, .LCPI9_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
; CHECK-NEXT: adrp x8, .LCPI9_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1]
@@ -253,11 +254,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI9_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -275,12 +275,12 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #39321
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI10_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI10_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -298,17 +298,17 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: adrp x8, .LCPI11_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -322,6 +322,7 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: adrp x9, .LCPI12_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: adrp x8, .LCPI12_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1]
@@ -332,11 +333,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI12_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -441,6 +441,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: adrp x9, .LCPI16_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: adrp x8, .LCPI16_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1]
@@ -451,11 +452,10 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI16_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -469,6 +469,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: adrp x9, .LCPI17_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: adrp x8, .LCPI17_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1]
@@ -479,11 +480,10 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI17_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -497,6 +497,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI18_0
; CHECK-NEXT: adrp x9, .LCPI18_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: adrp x8, .LCPI18_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_1]
@@ -507,11 +508,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-NEXT: adrp x8, .LCPI18_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -529,12 +529,12 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #39321
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI19_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: adrp x8, .LCPI19_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI19_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -552,17 +552,17 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -576,6 +576,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI21_0
; CHECK-NEXT: adrp x9, .LCPI21_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
; CHECK-NEXT: adrp x8, .LCPI21_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_1]
@@ -586,11 +587,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI21_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -606,6 +606,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI22_0
; CHECK-NEXT: adrp x9, .LCPI22_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
; CHECK-NEXT: adrp x8, .LCPI22_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_1]
@@ -616,11 +617,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI22_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -634,6 +634,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI23_0
; CHECK-NEXT: adrp x9, .LCPI23_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
; CHECK-NEXT: adrp x8, .LCPI23_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1]
@@ -644,11 +645,10 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI23_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -662,6 +662,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: adrp x9, .LCPI24_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: adrp x8, .LCPI24_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_1]
@@ -672,11 +673,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI24_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -691,6 +691,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI25_0
; CHECK-NEXT: adrp x9, .LCPI25_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0]
; CHECK-NEXT: adrp x8, .LCPI25_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_1]
@@ -701,11 +702,10 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-NEXT: adrp x8, .LCPI25_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -718,6 +718,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI26_0
; CHECK-NEXT: adrp x9, .LCPI26_1
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0]
; CHECK-NEXT: adrp x8, .LCPI26_2
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1]
@@ -728,11 +729,10 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-NEXT: adrp x8, .LCPI26_4
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_4]
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index c37e5450160f..fc033bc741c1 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -33,6 +33,7 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #47184
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: mov w8, #23592
@@ -40,11 +41,10 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
; CHECK-NEXT: movk w8, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -86,6 +86,7 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w9, #47184
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
; CHECK-NEXT: mov w8, #23592
@@ -93,11 +94,10 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-NEXT: movk w8, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100>
%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -114,15 +114,15 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: movi v1.4s, #25
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: sshr v3.4s, v2.4s, #3
-; CHECK-NEXT: usra v3.4s, v2.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: movi v3.4s, #25
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sshr v2.4s, v1.4s, #3
+; CHECK-NEXT: usra v2.4s, v1.4s, #31
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -137,15 +137,15 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: sshr v3.4s, v2.4s, #5
-; CHECK-NEXT: usra v3.4s, v2.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: movi v3.4s, #100
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sshr v2.4s, v1.4s, #5
+; CHECK-NEXT: usra v2.4s, v1.4s, #31
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -184,12 +184,12 @@ define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind {
define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_pow2:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT: mov v3.16b, v0.16b
+; CHECK-NEXT: cmlt v3.4s, v0.4s, #0
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: usra v2.4s, v3.4s, #28
; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: usra v3.4s, v2.4s, #28
-; CHECK-NEXT: bic v3.4s, #15
-; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: bic v2.4s, #15
+; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -204,10 +204,10 @@ define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_int_min:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT: mov v3.16b, v0.16b
-; CHECK-NEXT: movi v1.4s, #128, lsl #24
-; CHECK-NEXT: usra v3.4s, v2.4s, #1
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: movi v3.4s, #128, lsl #24
+; CHECK-NEXT: usra v1.4s, v2.4s, #1
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 95e56df351da..74c7a55c8be7 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-LABEL: v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sqsub v0.8b, v1.8b, v0.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <8 x i8>, <8 x i8>* %px
@@ -159,9 +159,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i16>, <4 x i16>* %px
@@ -225,9 +225,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-LABEL: v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr b0, [x0]
-; CHECK-NEXT: ldr b1, [x1]
-; CHECK-NEXT: sqsub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr b0, [x1]
+; CHECK-NEXT: ldr b1, [x0]
+; CHECK-NEXT: sqsub v0.8b, v1.8b, v0.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
%x = load <1 x i8>, <1 x i8>* %px
@@ -240,9 +240,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-LABEL: v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ldr h1, [x1]
-; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr h0, [x1]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
%x = load <1 x i16>, <1 x i16>* %px
@@ -255,10 +255,10 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; CHECK-LABEL: v16i4:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
-; CHECK-NEXT: sshr v1.16b, v1.16b, #4
+; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: sshr v0.16b, v0.16b, #4
+; CHECK-NEXT: sshr v1.16b, v1.16b, #4
; CHECK-NEXT: shl v1.16b, v1.16b, #4
; CHECK-NEXT: shl v0.16b, v0.16b, #4
; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index f95860d55a40..8b1bae5009a1 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -52,11 +52,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_EQ_128-LABEL: sdiv_v8i8:
; VBITS_EQ_128: sshll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sshll v0.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: sunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
; VBITS_EQ_128-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: sunpkhi z3.s, z0.h
; VBITS_EQ_128-NEXT: sunpklo z0.s, z0.h
; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
@@ -350,8 +350,8 @@ define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: sdiv_v4i16:
; CHECK: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
; CHECK-NEXT: mov w8, v1.s[1]
; CHECK-NEXT: mov w9, v1.s[2]
@@ -364,8 +364,8 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_EQ_128-LABEL: sdiv_v4i16:
; VBITS_EQ_128: sshll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sshll v0.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
; VBITS_EQ_128-NEXT: ret
@@ -744,11 +744,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_EQ_128-LABEL: udiv_v8i8:
; VBITS_EQ_128: ushll v1.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: ushll v0.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: uunpkhi z2.s, z1.h
-; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
; VBITS_EQ_128-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_128-NEXT: uunpkhi z3.s, z0.h
; VBITS_EQ_128-NEXT: uunpklo z0.s, z0.h
; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
@@ -1040,8 +1040,8 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: udiv_v4i16:
; CHECK: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s
; CHECK-NEXT: mov w8, v1.s[1]
; CHECK-NEXT: mov w9, v1.s[2]
@@ -1054,8 +1054,8 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_EQ_128-LABEL: udiv_v4i16:
; VBITS_EQ_128: ushll v1.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: ushll v0.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; VBITS_EQ_128-NEXT: xtn v0.4h, v0.4s
; VBITS_EQ_128-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index 710575a54477..c83e6ded4841 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -288,16 +288,16 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
-; CHECK-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
; VBITS_EQ_128-LABEL: smulh_v2i32:
; VBITS_EQ_128: sshll v0.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: sshll v1.2d, v1.2s, #0
; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
; VBITS_EQ_128-NEXT: ret
@@ -785,16 +785,16 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: umulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: shrn v0.2s, v0.2d, #32
; CHECK-NEXT: ret
; VBITS_EQ_128-LABEL: umulh_v2i32:
; VBITS_EQ_128: ushll v0.2d, v0.2s, #0
-; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
; VBITS_EQ_128-NEXT: ptrue p0.d, vl2
+; VBITS_EQ_128-NEXT: ushll v1.2d, v1.2s, #0
; VBITS_EQ_128-NEXT: mul z0.d, p0/m, z0.d, z1.d
; VBITS_EQ_128-NEXT: shrn v0.2s, v0.2d, #32
; VBITS_EQ_128-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 3626aa915541..1a7774bd1174 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -53,11 +53,11 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_EQ_128-LABEL: srem_v8i8:
; VBITS_EQ_128: sshll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sshll v3.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: sunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
; VBITS_EQ_128-NEXT: sunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: sunpkhi z5.s, z3.h
; VBITS_EQ_128-NEXT: sunpklo z3.s, z3.h
; VBITS_EQ_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
@@ -364,8 +364,8 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: srem_v4i16:
; CHECK: sshll v2.4s, v1.4h, #0
-; CHECK-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
@@ -379,8 +379,8 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_EQ_128-LABEL: srem_v4i16:
; VBITS_EQ_128: sshll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: sshll v3.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
@@ -812,11 +812,11 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_EQ_128-LABEL: urem_v8i8:
; VBITS_EQ_128: ushll v2.8h, v1.8b, #0
-; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: ushll v3.8h, v0.8b, #0
; VBITS_EQ_128-NEXT: uunpkhi z4.s, z2.h
-; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
; VBITS_EQ_128-NEXT: uunpklo z2.s, z2.h
+; VBITS_EQ_128-NEXT: uunpkhi z5.s, z3.h
; VBITS_EQ_128-NEXT: uunpklo z3.s, z3.h
; VBITS_EQ_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s
; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
@@ -1121,8 +1121,8 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: urem_v4i16:
; CHECK: ushll v2.4s, v1.4h, #0
-; CHECK-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
@@ -1136,8 +1136,8 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_EQ_128-LABEL: urem_v4i16:
; VBITS_EQ_128: ushll v2.4s, v1.4h, #0
-; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_128-NEXT: ushll v3.4s, v0.4h, #0
; VBITS_EQ_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s
; VBITS_EQ_128-NEXT: xtn v2.4h, v2.4s
; VBITS_EQ_128-NEXT: mls v0.4h, v2.4h, v1.4h
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index a2dc244c848b..44bb3674ee1c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function masked_scatter_v8i8,masked_scatter_v8i16,masked_scatter_v8i32,masked_scatter_v8i64 --prefix VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK
@@ -85,9 +86,9 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8
; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8
; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8
+; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h
; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s
@@ -99,7 +100,6 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; VBITS_EQ_256-NEXT: st1b { z1.d }, p1, [z4.d]
; VBITS_EQ_256-NEXT: st1b { z0.d }, p0, [z3.d]
; VBITS_EQ_256-NEXT: ret
-;
; VBITS_GE_512-LABEL: masked_scatter_v8i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr d0, [x0]
@@ -108,8 +108,8 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; VBITS_GE_512-NEXT: cmeq v2.8b, v0.8b, #0
; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT: sunpklo z2.h, z2.b
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0
@@ -131,8 +131,8 @@ define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
; VBITS_GE_1024-NEXT: cmeq v2.16b, v0.16b, #0
; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: sunpklo z2.h, z2.b
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h
; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s
; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z2.d, #0
@@ -226,8 +226,8 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
; VBITS_EQ_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s
; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
@@ -240,7 +240,6 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
; VBITS_EQ_256-NEXT: uunpklo z1.d, z3.s
; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [z4.d]
; VBITS_EQ_256-NEXT: ret
-;
; VBITS_GE_512-LABEL: masked_scatter_v8i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ldr q0, [x0]
@@ -369,7 +368,6 @@ define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
; VBITS_EQ_256-NEXT: st1w { z1.d }, p0, [z3.d]
; VBITS_EQ_256-NEXT: st1w { z0.d }, p1, [z2.d]
; VBITS_EQ_256-NEXT: ret
-;
; VBITS_GE_512-LABEL: masked_scatter_v8i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
@@ -455,10 +453,10 @@ define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: cmeq v1.2d, v0.2d, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: st1d { z0.d }, p0, [z2.d]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: cmeq v2.2d, v0.2d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
; CHECK-NEXT: ret
%vals = load <2 x i64>, <2 x i64>* %a
%ptrs = load <2 x i64*>, <2 x i64*>* %b
@@ -498,7 +496,6 @@ define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [z3.d]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p1, [z2.d]
; VBITS_EQ_256-NEXT: ret
-;
; VBITS_GE_512-LABEL: masked_scatter_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
index 2c13eea4ca4f..19ebd4265bd6 100644
--- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -15,8 +15,8 @@ define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-NOARG-NEXT: ldp q6, q4, [x1]
; CHECK-NOARG-NEXT: stp q0, q1, [x0, #32]
; CHECK-NOARG-NEXT: add v2.4s, v2.4s, v6.4s
-; CHECK-NOARG-NEXT: add v3.4s, v3.4s, v4.4s
-; CHECK-NOARG-NEXT: stp q2, q3, [x0]
+; CHECK-NOARG-NEXT: add v0.4s, v3.4s, v4.4s
+; CHECK-NOARG-NEXT: stp q2, q0, [x0]
; CHECK-NOARG-NEXT: ret
;
; CHECK-ARG-LABEL: func_vscale_none:
@@ -47,8 +47,8 @@ define void @func_vscale1_1(<16 x i32>* %a, <16 x i32>* %b) #1 {
; CHECK-NEXT: ldp q6, q4, [x1]
; CHECK-NEXT: stp q0, q1, [x0, #32]
; CHECK-NEXT: add v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: add v3.4s, v3.4s, v4.4s
-; CHECK-NEXT: stp q2, q3, [x0]
+; CHECK-NEXT: add v0.4s, v3.4s, v4.4s
+; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index f8298485f35b..46dd3db9e97f 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-LABEL: v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqadd v0.8b, v1.8b, v0.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <8 x i8>, <8 x i8>* %px
@@ -158,9 +158,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqadd v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i16>, <4 x i16>* %px
@@ -225,9 +225,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-LABEL: v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr b0, [x0]
-; CHECK-NEXT: ldr b1, [x1]
-; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr b0, [x1]
+; CHECK-NEXT: ldr b1, [x0]
+; CHECK-NEXT: uqadd v0.8b, v1.8b, v0.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
%x = load <1 x i8>, <1 x i8>* %px
@@ -240,9 +240,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-LABEL: v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ldr h1, [x1]
-; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr h0, [x1]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: uqadd v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
%x = load <1 x i16>, <1 x i16>* %px
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
index 334744116a8e..e7f7e1375687 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
@@ -67,25 +67,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: adrp x9, .LCPI4_1
; CHECK-NEXT: mov v0.h[1], w1
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: adrp x8, .LCPI4_1
+; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: mov v0.h[2], w2
; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT: adrp x8, .LCPI4_2
-; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT: movi d1, #0x0000000000ffff
-; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
; CHECK-NEXT: adrp x8, .LCPI4_3
-; CHECK-NEXT: shl v2.4h, v0.4h, #1
+; CHECK-NEXT: shl v3.4h, v0.4h, #1
; CHECK-NEXT: bic v0.4h, #248, lsl #8
-; CHECK-NEXT: ushl v2.4h, v2.4h, v3.4h
; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: ushl v1.4h, v3.4h, v2.4h
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: bic v0.4h, #248, lsl #8
-; CHECK-NEXT: cmhi v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: cmhi v0.4h, v0.4h, v2.4h
; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: umov w1, v0.h[1]
; CHECK-NEXT: umov w2, v0.h[2]
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
index 4fe59ff23771..a2a27a1508a1 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -7,6 +7,7 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: adrp x9, .LCPI0_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_2]
@@ -15,11 +16,10 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI0_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -34,13 +34,13 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_allones_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: adrp x8, .LCPI1_1
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -51,13 +51,13 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_allones_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: adrp x8, .LCPI2_1
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 4294967295, i32 5>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -71,6 +71,7 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: adrp x9, .LCPI3_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: adrp x8, .LCPI3_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_2]
@@ -79,11 +80,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI3_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -95,6 +95,7 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: adrp x9, .LCPI4_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_2]
@@ -103,11 +104,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI4_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -121,6 +121,7 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: adrp x9, .LCPI5_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: adrp x8, .LCPI5_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_2]
@@ -129,11 +130,10 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI5_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -145,6 +145,7 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI6_0
; CHECK-NEXT: adrp x9, .LCPI6_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
; CHECK-NEXT: adrp x8, .LCPI6_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_2]
@@ -153,11 +154,10 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI6_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3]
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
%cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -173,6 +173,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI7_0
; CHECK-NEXT: adrp x9, .LCPI7_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
; CHECK-NEXT: adrp x8, .LCPI7_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_2]
@@ -181,11 +182,10 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI7_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -199,6 +199,7 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI8_0
; CHECK-NEXT: adrp x9, .LCPI8_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: adrp x8, .LCPI8_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_2]
@@ -207,11 +208,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI8_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -225,6 +225,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI9_0
; CHECK-NEXT: adrp x9, .LCPI9_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
; CHECK-NEXT: adrp x8, .LCPI9_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_2]
@@ -233,11 +234,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI9_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -252,14 +252,14 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_one:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #52429
-; CHECK-NEXT: adrp x9, .LCPI10_0
; CHECK-NEXT: movk w8, #52428, lsl #16
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI10_0]
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -273,16 +273,16 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #28087
; CHECK-NEXT: movk w8, #46811, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: adrp x8, .LCPI11_0
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0]
; CHECK-NEXT: shl v1.4s, v0.4s, #31
; CHECK-NEXT: ushr v0.4s, v0.4s, #1
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -296,6 +296,7 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: adrp x9, .LCPI12_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: adrp x8, .LCPI12_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_2]
@@ -304,11 +305,10 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI12_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -324,6 +324,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: adrp x9, .LCPI13_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: adrp x8, .LCPI13_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI13_2]
@@ -332,11 +333,10 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI13_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -350,6 +350,7 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: adrp x9, .LCPI14_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: adrp x8, .LCPI14_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_2]
@@ -358,11 +359,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI14_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -376,6 +376,7 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: adrp x9, .LCPI15_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: adrp x8, .LCPI15_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI15_2]
@@ -384,11 +385,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI15_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -404,6 +404,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: adrp x9, .LCPI16_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
; CHECK-NEXT: adrp x8, .LCPI16_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_2]
@@ -412,11 +413,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI16_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -430,6 +430,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: adrp x9, .LCPI17_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: adrp x8, .LCPI17_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_2]
@@ -438,11 +439,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI17_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -456,6 +456,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI18_0
; CHECK-NEXT: adrp x9, .LCPI18_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
; CHECK-NEXT: adrp x8, .LCPI18_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_2]
@@ -464,11 +465,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-NEXT: adrp x8, .LCPI18_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -483,13 +483,13 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_allones_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI19_0
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
; CHECK-NEXT: adrp x8, .LCPI19_1
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_1]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -503,6 +503,7 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: adrp x9, .LCPI20_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
; CHECK-NEXT: adrp x8, .LCPI20_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_2]
@@ -511,11 +512,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI20_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -529,6 +529,7 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI21_0
; CHECK-NEXT: adrp x9, .LCPI21_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
; CHECK-NEXT: adrp x8, .LCPI21_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_2]
@@ -537,11 +538,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI21_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -557,6 +557,7 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI22_0
; CHECK-NEXT: adrp x9, .LCPI22_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
; CHECK-NEXT: adrp x8, .LCPI22_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_2]
@@ -565,11 +566,10 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI22_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -583,6 +583,7 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI23_0
; CHECK-NEXT: adrp x9, .LCPI23_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
; CHECK-NEXT: adrp x8, .LCPI23_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_2]
@@ -591,11 +592,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI23_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -609,6 +609,7 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: adrp x9, .LCPI24_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: adrp x8, .LCPI24_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_2]
@@ -617,11 +618,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-NEXT: adrp x8, .LCPI24_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -636,6 +636,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI25_0
; CHECK-NEXT: adrp x9, .LCPI25_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0]
; CHECK-NEXT: adrp x8, .LCPI25_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_2]
@@ -644,11 +645,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-NEXT: adrp x8, .LCPI25_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -661,6 +661,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI26_0
; CHECK-NEXT: adrp x9, .LCPI26_2
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0]
; CHECK-NEXT: adrp x8, .LCPI26_1
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_2]
@@ -669,11 +670,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-NEXT: adrp x8, .LCPI26_3
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3]
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
index c59ff8258696..a989eaa37c11 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -54,11 +54,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
; CHECK-NEXT: mov w8, #43690
; CHECK-NEXT: movk w8, #10922, lsl #16
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: shl v1.4s, v0.4s, #31
; CHECK-NEXT: ushr v0.4s, v0.4s, #1
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
@@ -70,18 +70,18 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
; CHECK-LABEL: t32_6_part1:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: movk w9, #43690, lsl #16
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: mov w8, #43691
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
; CHECK-NEXT: shl v1.4s, v0.4s, #31
; CHECK-NEXT: ushr v0.4s, v0.4s, #1
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 6, i32 6, i32 6, i32 6>
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index e5b21fcd6553..19c59975d7de 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -7,14 +7,14 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: mov w8, #28835
; CHECK-NEXT: movk w8, #2621, lsl #16
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 25, i32 25, i32 25, i32 25>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -28,17 +28,17 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #23593
; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: mov w8, #23592
; CHECK-NEXT: movk w8, #655, lsl #16
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: shl v1.4s, v0.4s, #30
; CHECK-NEXT: ushr v0.4s, v0.4s, #2
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 100, i32 100, i32 100, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -53,13 +53,13 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_neg25:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: adrp x8, .LCPI2_1
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 25, i32 -25, i32 -25, i32 25>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -72,16 +72,16 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_neg100:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: movi v3.4s, #1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: adrp x8, .LCPI3_1
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
; CHECK-NEXT: shl v1.4s, v0.4s, #30
; CHECK-NEXT: ushr v0.4s, v0.4s, #2
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 -100, i32 100, i32 -100, i32 100>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -98,13 +98,13 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: movi v1.4s, #25
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: ushr v2.4s, v2.4s, #3
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: movi v2.4s, #25
+; CHECK-NEXT: ushr v1.4s, v1.4s, #3
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -120,13 +120,13 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: ushr v2.4s, v2.4s, #5
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: movi v2.4s, #100
+; CHECK-NEXT: ushr v1.4s, v1.4s, #5
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -167,10 +167,10 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_pow2:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #15
+; CHECK-NEXT: movi v2.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -182,8 +182,8 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_int_min(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_int_min:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.4s, #128, lsl #24
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: bic v0.4s, #128, lsl #24
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
index 30574505998d..be08ee4c893b 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
@@ -5,11 +5,11 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
; CHECK-LABEL: t0_all_tautological:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: adrp x9, .LCPI0_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: adrp x8, .LCPI0_1
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 5e7f7d350f33..666e5a613433 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
define void @v8i8(<8 x i8>* %px, <8 x i8>* %py, <8 x i8>* %pz) nounwind {
; CHECK-LABEL: v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqsub v0.8b, v1.8b, v0.8b
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <8 x i8>, <8 x i8>* %px
@@ -155,9 +155,9 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
define void @v4i16(<4 x i16>* %px, <4 x i16>* %py, <4 x i16>* %pz) nounwind {
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: uqsub v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str d0, [x2]
; CHECK-NEXT: ret
%x = load <4 x i16>, <4 x i16>* %px
@@ -220,9 +220,9 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind
define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
; CHECK-LABEL: v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr b0, [x0]
-; CHECK-NEXT: ldr b1, [x1]
-; CHECK-NEXT: uqsub v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ldr b0, [x1]
+; CHECK-NEXT: ldr b1, [x0]
+; CHECK-NEXT: uqsub v0.8b, v1.8b, v0.8b
; CHECK-NEXT: st1 { v0.b }[0], [x2]
; CHECK-NEXT: ret
%x = load <1 x i8>, <1 x i8>* %px
@@ -235,9 +235,9 @@ define void @v1i8(<1 x i8>* %px, <1 x i8>* %py, <1 x i8>* %pz) nounwind {
define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
; CHECK-LABEL: v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr h0, [x0]
-; CHECK-NEXT: ldr h1, [x1]
-; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ldr h0, [x1]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: uqsub v0.4h, v1.4h, v0.4h
; CHECK-NEXT: str h0, [x2]
; CHECK-NEXT: ret
%x = load <1 x i16>, <1 x i16>* %px
diff --git a/llvm/test/CodeGen/AArch64/vec_cttz.ll b/llvm/test/CodeGen/AArch64/vec_cttz.ll
index 1cd7f3408233..231790fc2121 100644
--- a/llvm/test/CodeGen/AArch64/vec_cttz.ll
+++ b/llvm/test/CodeGen/AArch64/vec_cttz.ll
@@ -85,8 +85,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %a) nounwind {
; CHECK-NEXT: movi v1.8h, #1
; CHECK-NEXT: sub v1.8h, v0.8h, v1.8h
; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: movi v1.8h, #16
; CHECK-NEXT: clz v0.8h, v0.8h
+; CHECK-NEXT: movi v1.8h, #16
; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h
; CHECK-NEXT: ret
%b = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
@@ -99,8 +99,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind {
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s
; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: movi v1.4s, #32
; CHECK-NEXT: clz v0.4s, v0.4s
+; CHECK-NEXT: movi v1.4s, #32
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
%b = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index f75d247e88c3..516f0297b462 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -52,8 +52,8 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; CHECK-NEXT: add v1.4s, v0.4s, v1.4s
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: st1 { v1.s }[2], [x8]
; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: st1 { v1.s }[2], [x8]
; CHECK-NEXT: ret
%t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
%val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -84,27 +84,27 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; CHECK-NEXT: fmov s0, w6
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: add x9, sp, #8
; CHECK-NEXT: ldr s2, [sp, #16]
-; CHECK-NEXT: fmov s3, w4
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: add x10, sp, #8
; CHECK-NEXT: mov v0.s[1], w7
+; CHECK-NEXT: fmov s3, w4
; CHECK-NEXT: mov v1.s[1], w1
+; CHECK-NEXT: ld1 { v2.s }[1], [x9]
; CHECK-NEXT: mov v3.s[1], w5
; CHECK-NEXT: ld1 { v0.s }[2], [x8]
-; CHECK-NEXT: add x8, sp, #24
; CHECK-NEXT: mov v1.s[2], w2
-; CHECK-NEXT: ld1 { v2.s }[1], [x8]
-; CHECK-NEXT: ld1 { v0.s }[3], [x9]
-; CHECK-NEXT: mov v1.s[3], w3
; CHECK-NEXT: ldr x8, [sp, #32]
; CHECK-NEXT: add v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s
+; CHECK-NEXT: ld1 { v0.s }[3], [x10]
+; CHECK-NEXT: mov v1.s[3], w3
; CHECK-NEXT: str d2, [x8, #16]
-; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s
-; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s
; CHECK-NEXT: mov w5, v3.s[1]
; CHECK-NEXT: fmov w4, s3
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: mov w1, v1.s[1]
; CHECK-NEXT: mov w2, v1.s[2]
; CHECK-NEXT: mov w3, v1.s[3]
@@ -141,23 +141,23 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; CHECK-NEXT: add v4.16b, v0.16b, v1.16b
; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b
; CHECK-NEXT: str q4, [x0]
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
-; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b
-; CHECK-NEXT: zip2 v1.8b, v1.8b, v0.8b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v5.8b, v0.8b, v0.8b
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v1.4s, #0
+; CHECK-NEXT: cmlt v1.4s, v2.4s, #0
+; CHECK-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-NEXT: ushll v3.4s, v5.4h, #0
; CHECK-NEXT: shl v2.4s, v2.4s, #31
-; CHECK-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: shl v5.4s, v0.4s, #31
-; CHECK-NEXT: cmlt v0.4s, v2.4s, #0
; CHECK-NEXT: shl v3.4s, v3.4s, #31
-; CHECK-NEXT: shl v6.4s, v1.4s, #31
-; CHECK-NEXT: cmlt v1.4s, v5.4s, #0
-; CHECK-NEXT: cmlt v2.4s, v3.4s, #0
-; CHECK-NEXT: cmlt v3.4s, v6.4s, #0
+; CHECK-NEXT: cmlt v2.4s, v2.4s, #0
+; CHECK-NEXT: cmlt v3.4s, v3.4s, #0
; CHECK-NEXT: ret
%t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
%val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -213,26 +213,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; CHECK-NEXT: bic v1.4s, #255, lsl #24
; CHECK-NEXT: bic v0.4s, #255, lsl #24
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, v0.s[3]
; CHECK-NEXT: mov w9, v0.s[2]
; CHECK-NEXT: mov w10, v0.s[1]
; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bic v1.4s, #255, lsl #24
; CHECK-NEXT: sturh w8, [x0, #9]
; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s
; CHECK-NEXT: strh w9, [x0, #6]
; CHECK-NEXT: sturh w10, [x0, #3]
; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: strh w11, [x0]
-; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: lsr w10, w10, #16
; CHECK-NEXT: strb w8, [x0, #11]
-; CHECK-NEXT: lsr w8, w10, #16
-; CHECK-NEXT: lsr w10, w11, #16
-; CHECK-NEXT: strb w9, [x0, #8]
+; CHECK-NEXT: lsr w8, w11, #16
+; CHECK-NEXT: strh w11, [x0]
; CHECK-NEXT: mvn v0.16b, v1.16b
-; CHECK-NEXT: strb w8, [x0, #5]
-; CHECK-NEXT: strb w10, [x0, #2]
+; CHECK-NEXT: strb w9, [x0, #8]
+; CHECK-NEXT: strb w10, [x0, #5]
+; CHECK-NEXT: strb w8, [x0, #2]
; CHECK-NEXT: ret
%t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
%val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
@@ -249,20 +249,20 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: and v1.8b, v0.8b, v2.8b
; CHECK-NEXT: umov w8, v0.h[1]
; CHECK-NEXT: umov w9, v0.h[2]
; CHECK-NEXT: umov w10, v0.h[0]
; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: cmeq v1.4h, v1.4h, v0.4h
+; CHECK-NEXT: and v1.8b, v0.8b, v2.8b
+; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
; CHECK-NEXT: and w8, w8, #0x1
; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: bfi w10, w8, #1, #1
-; CHECK-NEXT: mvn v1.8b, v1.8b
; CHECK-NEXT: bfi w10, w9, #2, #1
; CHECK-NEXT: bfi w10, w11, #3, #29
; CHECK-NEXT: and w8, w10, #0xf
-; CHECK-NEXT: sshll v0.4s, v1.4h, #0
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index d305b688f3af..602e4c727ef1 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -22,8 +22,8 @@ define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) noun
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: shrn v0.2s, v1.2d, #32
; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: str s1, [x0]
; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: str s1, [x0]
; CHECK-NEXT: ret
%t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
%val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
@@ -39,8 +39,8 @@ define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) noun
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: shrn v0.2s, v1.2d, #32
; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
%t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
%val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
@@ -96,37 +96,37 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; CHECK-NEXT: fmov s0, w6
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: add x9, sp, #8
; CHECK-NEXT: ldr s2, [sp, #16]
-; CHECK-NEXT: fmov s3, w4
+; CHECK-NEXT: add x9, sp, #24
+; CHECK-NEXT: add x10, sp, #8
; CHECK-NEXT: mov v0.s[1], w7
+; CHECK-NEXT: fmov s3, w4
; CHECK-NEXT: mov v1.s[1], w1
+; CHECK-NEXT: ld1 { v2.s }[1], [x9]
; CHECK-NEXT: mov v3.s[1], w5
; CHECK-NEXT: ld1 { v0.s }[2], [x8]
-; CHECK-NEXT: add x8, sp, #24
; CHECK-NEXT: mov v1.s[2], w2
-; CHECK-NEXT: ld1 { v2.s }[1], [x8]
-; CHECK-NEXT: ld1 { v0.s }[3], [x9]
-; CHECK-NEXT: mov v1.s[3], w3
; CHECK-NEXT: ldr x8, [sp, #32]
-; CHECK-NEXT: umull2 v6.2d, v3.4s, v2.4s
+; CHECK-NEXT: umull2 v4.2d, v3.4s, v2.4s
+; CHECK-NEXT: ld1 { v0.s }[3], [x10]
+; CHECK-NEXT: mov v1.s[3], w3
; CHECK-NEXT: umull v7.2d, v3.2s, v2.2s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v0.4s
-; CHECK-NEXT: umull v5.2d, v1.2s, v0.2s
; CHECK-NEXT: mul v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp2 v5.4s, v7.4s, v6.4s
+; CHECK-NEXT: umull2 v5.2d, v1.4s, v0.4s
+; CHECK-NEXT: umull v6.2d, v1.2s, v0.2s
+; CHECK-NEXT: uzp2 v4.4s, v7.4s, v4.4s
; CHECK-NEXT: str d2, [x8, #16]
-; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp2 v5.4s, v6.4s, v5.4s
; CHECK-NEXT: cmtst v4.4s, v4.4s, v4.4s
+; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: cmtst v3.4s, v5.4s, v5.4s
-; CHECK-NEXT: mov w1, v4.s[1]
-; CHECK-NEXT: mov w2, v4.s[2]
-; CHECK-NEXT: mov w3, v4.s[3]
-; CHECK-NEXT: mov w5, v3.s[1]
-; CHECK-NEXT: fmov w0, s4
-; CHECK-NEXT: fmov w4, s3
+; CHECK-NEXT: mov w5, v4.s[1]
+; CHECK-NEXT: fmov w4, s4
+; CHECK-NEXT: mov w1, v3.s[1]
+; CHECK-NEXT: mov w2, v3.s[2]
+; CHECK-NEXT: mov w3, v3.s[3]
+; CHECK-NEXT: fmov w0, s3
; CHECK-NEXT: ret
%t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
%val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -166,29 +166,27 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: umull v3.8h, v0.8b, v1.8b
+; CHECK-NEXT: mul v5.16b, v0.16b, v1.16b
; CHECK-NEXT: uzp2 v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: str q5, [x0]
; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT: zip1 v4.8b, v2.8b, v0.8b
-; CHECK-NEXT: zip2 v2.8b, v2.8b, v0.8b
-; CHECK-NEXT: zip1 v5.8b, v3.8b, v0.8b
-; CHECK-NEXT: zip2 v3.8b, v3.8b, v0.8b
-; CHECK-NEXT: ushll v4.4s, v4.4h, #0
-; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: ushll v5.4s, v5.4h, #0
-; CHECK-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NEXT: shl v4.4s, v4.4s, #31
+; CHECK-NEXT: zip1 v3.8b, v2.8b, v0.8b
+; CHECK-NEXT: zip2 v4.8b, v2.8b, v0.8b
+; CHECK-NEXT: ext v0.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ushll v1.4s, v3.4h, #0
+; CHECK-NEXT: ushll v2.4s, v4.4h, #0
+; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT: zip2 v4.8b, v0.8b, v0.8b
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: shl v2.4s, v2.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v1.4s, #0
+; CHECK-NEXT: cmlt v1.4s, v2.4s, #0
+; CHECK-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-NEXT: ushll v3.4s, v4.4h, #0
; CHECK-NEXT: shl v2.4s, v2.4s, #31
-; CHECK-NEXT: shl v6.4s, v5.4s, #31
; CHECK-NEXT: shl v3.4s, v3.4s, #31
-; CHECK-NEXT: cmlt v4.4s, v4.4s, #0
-; CHECK-NEXT: cmlt v5.4s, v2.4s, #0
-; CHECK-NEXT: cmlt v2.4s, v6.4s, #0
+; CHECK-NEXT: cmlt v2.4s, v2.4s, #0
; CHECK-NEXT: cmlt v3.4s, v3.4s, #0
-; CHECK-NEXT: mul v6.16b, v0.16b, v1.16b
-; CHECK-NEXT: mov v0.16b, v4.16b
-; CHECK-NEXT: mov v1.16b, v5.16b
-; CHECK-NEXT: str q6, [x0]
; CHECK-NEXT: ret
%t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
%val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -262,30 +260,30 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; CHECK: // %bb.0:
; CHECK-NEXT: bic v1.4s, #255, lsl #24
; CHECK-NEXT: bic v0.4s, #255, lsl #24
-; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uzp2 v1.4s, v3.4s, v2.4s
-; CHECK-NEXT: ushr v2.4s, v0.4s, #24
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[1]
-; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s
+; CHECK-NEXT: mul v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: mov w8, v2.s[3]
+; CHECK-NEXT: mov w10, v2.s[2]
+; CHECK-NEXT: mov w11, v2.s[1]
+; CHECK-NEXT: ushr v1.4s, v2.4s, #24
+; CHECK-NEXT: uzp2 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: cmtst v1.4s, v1.4s, v1.4s
; CHECK-NEXT: sturh w8, [x0, #9]
; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: strh w9, [x0, #6]
-; CHECK-NEXT: lsr w9, w9, #16
-; CHECK-NEXT: sturh w10, [x0, #3]
-; CHECK-NEXT: orn v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: strh w10, [x0, #6]
+; CHECK-NEXT: lsr w10, w10, #16
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: sturh w11, [x0, #3]
+; CHECK-NEXT: lsr w11, w11, #16
; CHECK-NEXT: strb w8, [x0, #11]
-; CHECK-NEXT: lsr w8, w10, #16
-; CHECK-NEXT: lsr w10, w11, #16
-; CHECK-NEXT: strh w11, [x0]
-; CHECK-NEXT: strb w9, [x0, #8]
-; CHECK-NEXT: strb w8, [x0, #5]
-; CHECK-NEXT: strb w10, [x0, #2]
+; CHECK-NEXT: lsr w8, w9, #16
+; CHECK-NEXT: strh w9, [x0]
+; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: strb w10, [x0, #8]
+; CHECK-NEXT: strb w11, [x0, #5]
+; CHECK-NEXT: strb w8, [x0, #2]
; CHECK-NEXT: ret
%t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
%val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 3b946b8b2e09..82f45e56f833 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -407,17 +407,17 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v1.4s
+; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s
+; CHECK-NEXT: uaddl v1.2d, v3.2s, v1.2s
+; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT: add v2.2d, v5.2d, v4.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -432,17 +432,17 @@ define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: saddl2 v4.2d, v3.4s, v1.4s
+; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s
+; CHECK-NEXT: saddl v1.2d, v3.2s, v1.2s
+; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT: add v2.2d, v5.2d, v4.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1029,17 +1029,17 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: uaddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: uaddl2 v4.2d, v3.4s, v1.4s
+; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s
+; CHECK-NEXT: uaddl v1.2d, v3.2s, v1.2s
+; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT: add v2.2d, v5.2d, v4.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
@@ -1056,17 +1056,17 @@ define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v2.4s, v1.8h, #0
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: saddl2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT: saddl2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: saddl v2.2d, v3.2s, v2.2s
-; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: add v1.2d, v5.2d, v4.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: saddl2 v4.2d, v3.4s, v1.4s
+; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s
+; CHECK-NEXT: saddl v1.2d, v3.2s, v1.2s
+; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT: add v2.2d, v5.2d, v4.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
@@ -1766,29 +1766,29 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v4.4s, v2.8h, #0
-; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-NEXT: ushll v3.4s, v2.4h, #0
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v4.4s, v0.8h, #0
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v6.4s, v3.4h, #0
-; CHECK-NEXT: ushll v7.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: uaddl2 v16.2d, v5.4s, v4.4s
-; CHECK-NEXT: uaddl v4.2d, v5.2s, v4.2s
-; CHECK-NEXT: uaddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT: uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
+; CHECK-NEXT: uaddl2 v5.2d, v4.4s, v2.4s
+; CHECK-NEXT: uaddl2 v6.2d, v0.4s, v3.4s
+; CHECK-NEXT: ushll2 v7.8h, v1.16b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s
+; CHECK-NEXT: add v4.2d, v6.2d, v5.2d
+; CHECK-NEXT: uaddl v0.2d, v0.2s, v3.2s
+; CHECK-NEXT: ushll v3.4s, v7.4h, #0
+; CHECK-NEXT: ushll2 v5.4s, v7.8h, #0
+; CHECK-NEXT: ushll2 v6.4s, v1.8h, #0
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: uaddl2 v7.2d, v6.4s, v5.4s
+; CHECK-NEXT: uaddl v5.2d, v6.2s, v5.2s
+; CHECK-NEXT: uaddl2 v6.2d, v1.4s, v3.4s
; CHECK-NEXT: uaddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v6.4s
-; CHECK-NEXT: uaddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT: add v5.2d, v5.2d, v16.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: add v2.2d, v6.2d, v7.2d
+; CHECK-NEXT: add v1.2d, v1.2d, v5.2d
; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: add v2.2d, v3.2d, v2.2d
-; CHECK-NEXT: add v1.2d, v6.2d, v1.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v5.2d
; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-NEXT: addp d0, v0.2d
@@ -1808,29 +1808,29 @@ define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0
-; CHECK-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NEXT: sshll2 v5.4s, v0.8h, #0
+; CHECK-NEXT: sshll v3.4s, v2.4h, #0
+; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: sshll v6.4s, v3.4h, #0
-; CHECK-NEXT: sshll v7.4s, v1.4h, #0
-; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: saddl2 v16.2d, v5.4s, v4.4s
-; CHECK-NEXT: saddl v4.2d, v5.2s, v4.2s
-; CHECK-NEXT: saddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT: saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: saddl2 v2.2d, v1.4s, v3.4s
+; CHECK-NEXT: saddl2 v5.2d, v4.4s, v2.4s
+; CHECK-NEXT: saddl2 v6.2d, v0.4s, v3.4s
+; CHECK-NEXT: sshll2 v7.8h, v1.16b, #0
+; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: saddl v2.2d, v4.2s, v2.2s
+; CHECK-NEXT: add v4.2d, v6.2d, v5.2d
+; CHECK-NEXT: saddl v0.2d, v0.2s, v3.2s
+; CHECK-NEXT: sshll v3.4s, v7.4h, #0
+; CHECK-NEXT: sshll2 v5.4s, v7.8h, #0
+; CHECK-NEXT: sshll2 v6.4s, v1.8h, #0
+; CHECK-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-NEXT: saddl2 v7.2d, v6.4s, v5.4s
+; CHECK-NEXT: saddl v5.2d, v6.2s, v5.2s
+; CHECK-NEXT: saddl2 v6.2d, v1.4s, v3.4s
; CHECK-NEXT: saddl v1.2d, v1.2s, v3.2s
-; CHECK-NEXT: saddl2 v3.2d, v7.4s, v6.4s
-; CHECK-NEXT: saddl v6.2d, v7.2s, v6.2s
-; CHECK-NEXT: add v5.2d, v5.2d, v16.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: add v2.2d, v6.2d, v7.2d
+; CHECK-NEXT: add v1.2d, v1.2d, v5.2d
; CHECK-NEXT: add v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: add v2.2d, v3.2d, v2.2d
-; CHECK-NEXT: add v1.2d, v6.2d, v1.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v5.2d
; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-NEXT: addp d0, v0.2d
@@ -1925,21 +1925,21 @@ entry:
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.2d, v0.2s, #0
-; CHECK-NEXT: ushll v3.2d, v1.2s, #0
-; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushll v2.2d, v1.2s, #0
+; CHECK-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-NEXT: ushll2 v1.2d, v1.4s, #0
-; CHECK-NEXT: shl v2.2d, v2.2d, #56
+; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-NEXT: shl v3.2d, v3.2d, #56
+; CHECK-NEXT: shl v2.2d, v2.2d, #56
; CHECK-NEXT: shl v0.2d, v0.2d, #56
+; CHECK-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-NEXT: shl v1.2d, v1.2d, #56
; CHECK-NEXT: sshr v2.2d, v2.2d, #56
-; CHECK-NEXT: sshr v3.2d, v3.2d, #56
-; CHECK-NEXT: ssra v2.2d, v0.2d, #56
-; CHECK-NEXT: ssra v3.2d, v1.2d, #56
-; CHECK-NEXT: add v0.2d, v2.2d, v3.2d
+; CHECK-NEXT: ssra v3.2d, v0.2d, #56
+; CHECK-NEXT: ssra v2.2d, v1.2d, #56
+; CHECK-NEXT: add v0.2d, v3.2d, v2.2d
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index fc0f3a10f5b1..5d87506177e0 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -96,9 +96,9 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-LABEL: test_v9i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov w8, #-1
; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v1.b[9], w8
; CHECK-NEXT: mov v1.b[10], w8
; CHECK-NEXT: mov v1.b[11], w8
@@ -129,8 +129,8 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: fmov w9, s1
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 1a946dd2ca05..51b60332bf5a 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -302,14 +302,14 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
; CHECK-FP-NEXT: mvni v1.8h, #4, lsl #8
-; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
; CHECK-FP-NEXT: add x8, sp, #8
-; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
; CHECK-FP-NEXT: add x8, sp, #16
-; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 06aa415b909f..fade974b07dc 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -302,14 +302,14 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
; CHECK-FP-NEXT: mvni v1.8h, #132, lsl #8
-; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
; CHECK-FP-NEXT: add x8, sp, #8
-; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
; CHECK-FP-NEXT: add x8, sp, #16
-; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
index 3e9673acb006..754d9e8eb7ed 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -82,8 +82,8 @@ define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v2f32_v2f64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mvni.2s v2, #128, lsl #24
; CHECK-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-NEXT: mvni.2s v2, #128, lsl #24
; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -110,9 +110,9 @@ define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #
; CHECK-LABEL: test_copysign_v4f32_v4f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: fcvtn v1.2s, v1.2d
-; CHECK-NEXT: mvni.4s v3, #128, lsl #24
; CHECK-NEXT: fcvtn2 v1.4s, v2.2d
-; CHECK-NEXT: bif.16b v0, v1, v3
+; CHECK-NEXT: mvni.4s v2, #128, lsl #24
+; CHECK-NEXT: bif.16b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x float>
%r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0)
@@ -191,21 +191,21 @@ define <4 x half> @test_copysign_v4f16_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
; NOFP16-NEXT: mov h3, v1[1]
; NOFP16-NEXT: mov h4, v0[1]
-; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
; NOFP16-NEXT: fcvt s5, h1
; NOFP16-NEXT: fcvt s6, h0
; NOFP16-NEXT: mov h7, v1[2]
; NOFP16-NEXT: mov h16, v0[2]
+; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
+; NOFP16-NEXT: mov h1, v1[3]
; NOFP16-NEXT: fcvt s3, h3
; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: mov h1, v1[3]
; NOFP16-NEXT: bit.16b v5, v6, v2
; NOFP16-NEXT: fcvt s6, h7
; NOFP16-NEXT: fcvt s7, h16
+; NOFP16-NEXT: fcvt s1, h1
; NOFP16-NEXT: bit.16b v3, v4, v2
; NOFP16-NEXT: mov h4, v0[3]
; NOFP16-NEXT: fcvt h0, s5
-; NOFP16-NEXT: fcvt s1, h1
; NOFP16-NEXT: bit.16b v6, v7, v2
; NOFP16-NEXT: fcvt h3, s3
; NOFP16-NEXT: fcvt s4, h4
@@ -233,9 +233,9 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
; NOFP16-NEXT: fcvtn v1.4h, v1.4s
; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
; NOFP16-NEXT: mov h3, v0[1]
-; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
; NOFP16-NEXT: fcvt s5, h0
; NOFP16-NEXT: mov h7, v0[2]
+; NOFP16-NEXT: mvni.4s v2, #128, lsl #24
; NOFP16-NEXT: mov h4, v1[1]
; NOFP16-NEXT: fcvt s6, h1
; NOFP16-NEXT: mov h16, v1[2]
@@ -263,8 +263,8 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
;
; FP16-LABEL: test_copysign_v4f16_v4f32:
; FP16: ; %bb.0:
-; FP16-NEXT: mvni.4h v2, #128, lsl #8
; FP16-NEXT: fcvtn v1.4h, v1.4s
+; FP16-NEXT: mvni.4h v2, #128, lsl #8
; FP16-NEXT: bif.8b v0, v1, v2
; FP16-NEXT: ret
%tmp0 = fptrunc <4 x float> %b to <4 x half>
@@ -278,28 +278,28 @@ define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0
; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
; NOFP16-NEXT: mov d4, v1[1]
; NOFP16-NEXT: mov h5, v0[1]
-; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: fcvt s1, d1
; NOFP16-NEXT: fcvt s6, h0
; NOFP16-NEXT: mov h7, v0[2]
+; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: fcvt s4, d4
; NOFP16-NEXT: fcvt s5, h5
; NOFP16-NEXT: bit.16b v1, v6, v3
; NOFP16-NEXT: fcvt s6, d2
; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: bit.16b v4, v5, v3
; NOFP16-NEXT: mov d2, v2[1]
+; NOFP16-NEXT: bit.16b v4, v5, v3
; NOFP16-NEXT: mov h5, v0[3]
; NOFP16-NEXT: fcvt h0, s1
; NOFP16-NEXT: bit.16b v6, v7, v3
-; NOFP16-NEXT: fcvt h1, s4
; NOFP16-NEXT: fcvt s2, d2
+; NOFP16-NEXT: fcvt h1, s4
; NOFP16-NEXT: fcvt s4, h5
; NOFP16-NEXT: fcvt h5, s6
; NOFP16-NEXT: mov.h v0[1], v1[0]
; NOFP16-NEXT: mov.16b v1, v3
-; NOFP16-NEXT: mov.h v0[2], v5[0]
; NOFP16-NEXT: bsl.16b v1, v4, v2
+; NOFP16-NEXT: mov.h v0[2], v5[0]
; NOFP16-NEXT: fcvt h1, s1
; NOFP16-NEXT: mov.h v0[3], v1[0]
; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
@@ -307,17 +307,17 @@ define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0
;
; FP16-LABEL: test_copysign_v4f16_v4f64:
; FP16: ; %bb.0:
-; FP16-NEXT: mov d4, v1[1]
+; FP16-NEXT: mov d3, v1[1]
; FP16-NEXT: fcvt h1, d1
-; FP16-NEXT: mvni.4h v3, #128, lsl #8
-; FP16-NEXT: fcvt h4, d4
-; FP16-NEXT: mov.h v1[1], v4[0]
-; FP16-NEXT: fcvt h4, d2
+; FP16-NEXT: fcvt h3, d3
+; FP16-NEXT: mov.h v1[1], v3[0]
+; FP16-NEXT: fcvt h3, d2
; FP16-NEXT: mov d2, v2[1]
-; FP16-NEXT: mov.h v1[2], v4[0]
+; FP16-NEXT: mov.h v1[2], v3[0]
; FP16-NEXT: fcvt h2, d2
; FP16-NEXT: mov.h v1[3], v2[0]
-; FP16-NEXT: bif.8b v0, v1, v3
+; FP16-NEXT: mvni.4h v2, #128, lsl #8
+; FP16-NEXT: bif.8b v0, v1, v2
; FP16-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x half>
%r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
@@ -333,33 +333,33 @@ define <8 x half> @test_copysign_v8f16_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; NOFP16: ; %bb.0:
; NOFP16-NEXT: mov h5, v1[1]
; NOFP16-NEXT: mov h6, v0[1]
-; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: fcvt s2, h1
; NOFP16-NEXT: fcvt s4, h0
; NOFP16-NEXT: mov h7, v1[2]
; NOFP16-NEXT: mov h16, v0[2]
+; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
+; NOFP16-NEXT: mov h17, v0[3]
; NOFP16-NEXT: fcvt s5, h5
; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: mov h17, v0[3]
; NOFP16-NEXT: mov h18, v0[5]
; NOFP16-NEXT: bit.16b v2, v4, v3
; NOFP16-NEXT: mov h4, v1[3]
; NOFP16-NEXT: fcvt s7, h7
; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: bit.16b v5, v6, v3
; NOFP16-NEXT: fcvt s17, h17
+; NOFP16-NEXT: bit.16b v5, v6, v3
; NOFP16-NEXT: mov.16b v6, v3
; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: fcvt h2, s2
-; NOFP16-NEXT: fcvt h5, s5
; NOFP16-NEXT: bsl.16b v6, v16, v7
; NOFP16-NEXT: mov h7, v1[4]
; NOFP16-NEXT: mov h16, v0[4]
+; NOFP16-NEXT: fcvt h2, s2
+; NOFP16-NEXT: fcvt h5, s5
; NOFP16-NEXT: bit.16b v4, v17, v3
; NOFP16-NEXT: mov h17, v1[5]
-; NOFP16-NEXT: mov.h v2[1], v5[0]
; NOFP16-NEXT: fcvt s7, h7
; NOFP16-NEXT: fcvt s16, h16
+; NOFP16-NEXT: mov.h v2[1], v5[0]
; NOFP16-NEXT: fcvt h5, s6
; NOFP16-NEXT: fcvt s6, h17
; NOFP16-NEXT: fcvt s17, h18
@@ -403,11 +403,11 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
; NOFP16: ; %bb.0:
; NOFP16-NEXT: fcvtn v1.4h, v1.4s
; NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: mov h4, v0[1]
; NOFP16-NEXT: mov h5, v0[4]
; NOFP16-NEXT: fcvt s7, h0
; NOFP16-NEXT: mov h17, v0[2]
+; NOFP16-NEXT: mvni.4s v3, #128, lsl #24
; NOFP16-NEXT: mov h6, v1[1]
; NOFP16-NEXT: fcvt s16, h1
; NOFP16-NEXT: fcvt s4, h4
@@ -425,29 +425,29 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
; NOFP16-NEXT: fcvt h1, s7
; NOFP16-NEXT: mov.16b v7, v3
; NOFP16-NEXT: fcvt h4, s4
+; NOFP16-NEXT: bsl.16b v7, v17, v18
; NOFP16-NEXT: fcvt s6, h6
; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: fcvt h5, s5
-; NOFP16-NEXT: bsl.16b v7, v17, v18
; NOFP16-NEXT: mov h17, v0[5]
; NOFP16-NEXT: mov h18, v2[1]
+; NOFP16-NEXT: fcvt h5, s5
; NOFP16-NEXT: mov.h v1[1], v4[0]
-; NOFP16-NEXT: bif.16b v6, v16, v3
; NOFP16-NEXT: fcvt h4, s7
+; NOFP16-NEXT: bif.16b v6, v16, v3
; NOFP16-NEXT: fcvt s7, h17
; NOFP16-NEXT: fcvt s17, h18
-; NOFP16-NEXT: mov h16, v2[2]
-; NOFP16-NEXT: mov h2, v2[3]
-; NOFP16-NEXT: fcvt h6, s6
; NOFP16-NEXT: mov.h v1[2], v4[0]
; NOFP16-NEXT: mov h4, v0[6]
-; NOFP16-NEXT: bif.16b v7, v17, v3
-; NOFP16-NEXT: fcvt s16, h16
+; NOFP16-NEXT: mov h16, v2[2]
+; NOFP16-NEXT: fcvt h6, s6
; NOFP16-NEXT: mov h0, v0[7]
-; NOFP16-NEXT: fcvt s2, h2
+; NOFP16-NEXT: bif.16b v7, v17, v3
+; NOFP16-NEXT: mov h2, v2[3]
; NOFP16-NEXT: fcvt s4, h4
+; NOFP16-NEXT: fcvt s16, h16
; NOFP16-NEXT: mov.h v1[3], v6[0]
; NOFP16-NEXT: fcvt s0, h0
+; NOFP16-NEXT: fcvt s2, h2
; NOFP16-NEXT: bif.16b v4, v16, v3
; NOFP16-NEXT: mov.h v1[4], v5[0]
; NOFP16-NEXT: fcvt h5, s7
@@ -464,9 +464,9 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
; FP16: ; %bb.0:
; FP16-NEXT: fcvtn v2.4h, v2.4s
; FP16-NEXT: fcvtn v1.4h, v1.4s
-; FP16-NEXT: mvni.8h v3, #128, lsl #8
; FP16-NEXT: mov.d v1[1], v2[0]
-; FP16-NEXT: bif.16b v0, v1, v3
+; FP16-NEXT: mvni.8h v2, #128, lsl #8
+; FP16-NEXT: bif.16b v0, v1, v2
; FP16-NEXT: ret
%tmp0 = fptrunc <8 x float> %b to <8 x half>
%r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll
index 763edf825e1f..30ba6f2e3464 100644
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll
@@ -10,11 +10,11 @@
define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_C1_or_C2_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: adrp x9, .LCPI0_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
@@ -29,9 +29,9 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: adrp x9, .LCPI1_1
; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1]
-; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI1_1]
+; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
@@ -41,11 +41,11 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_Cplus1_or_C_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: adrp x9, .LCPI2_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
@@ -60,9 +60,9 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: adrp x9, .LCPI3_1
; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1]
-; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1]
+; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
@@ -72,11 +72,11 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_Cminus1_or_C_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: adrp x9, .LCPI4_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1]
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
@@ -91,9 +91,9 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-NEXT: adrp x8, .LCPI5_0
; CHECK-NEXT: adrp x9, .LCPI5_1
; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1]
-; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
; CHECK-NEXT: ret
%cond = icmp eq <4 x i32> %x, %y
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s
index fe34b839f390..afffb854740a 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-neon-instructions.s
@@ -1070,27 +1070,27 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 4 0.50 abs d29, d24
-# CHECK-NEXT: 1 4 1.00 abs v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 abs v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 abs v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 abs v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 abs v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 abs v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 abs v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 add d17, d31, d29
-# CHECK-NEXT: 1 4 0.50 add v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 addhn v0.2s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 addhn v0.4h, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 addhn v0.8b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 addhn2 v0.16b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 addhn2 v0.4s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 addhn2 v0.8h, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 addp v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 addp v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 and v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 bic v0.4h, #15, lsl #8
-# CHECK-NEXT: 1 4 0.50 bic v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 0.50 abs d29, d24
+# CHECK-NEXT: 1 3 1.00 abs v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 abs v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 abs v0.2s, v0.2s
+# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 abs v0.4s, v0.4s
+# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 abs v0.8h, v0.8h
+# CHECK-NEXT: 1 2 0.50 add d17, d31, d29
+# CHECK-NEXT: 1 2 0.50 add v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 addhn v0.2s, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 addhn v0.4h, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 addhn v0.8b, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 addhn2 v0.16b, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 addhn2 v0.4s, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 addhn2 v0.8h, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 addp v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 1 0.50 and v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 1 0.50 bic v0.4h, #15, lsl #8
+# CHECK-NEXT: 1 1 0.50 bic v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 4 1.00 bif v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 bit v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 0.50 bsl v0.8b, v0.8b, v0.8b
@@ -1106,28 +1106,28 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 clz v0.4s, v0.4s
# CHECK-NEXT: 1 4 0.50 clz v0.8b, v0.8b
# CHECK-NEXT: 1 4 1.00 clz v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 cmeq d20, d21, #0
-# CHECK-NEXT: 1 4 0.50 cmeq d20, d21, d22
-# CHECK-NEXT: 1 4 1.00 cmeq v0.16b, v0.16b, #0
-# CHECK-NEXT: 1 4 1.00 cmeq v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 cmge d20, d21, #0
-# CHECK-NEXT: 1 4 0.50 cmge d20, d21, d22
-# CHECK-NEXT: 1 4 0.50 cmge v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 cmge v0.8b, v0.8b, #0
-# CHECK-NEXT: 1 4 0.50 cmgt d20, d21, #0
-# CHECK-NEXT: 1 4 0.50 cmgt d20, d21, d22
-# CHECK-NEXT: 1 4 0.50 cmgt v0.2s, v0.2s, #0
-# CHECK-NEXT: 1 4 1.00 cmgt v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 cmhi d20, d21, d22
-# CHECK-NEXT: 1 4 1.00 cmhi v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 cmhs d20, d21, d22
-# CHECK-NEXT: 1 4 0.50 cmhs v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 cmle d20, d21, #0
-# CHECK-NEXT: 1 4 1.00 cmle v0.2d, v0.2d, #0
-# CHECK-NEXT: 1 4 0.50 cmlt d20, d21, #0
-# CHECK-NEXT: 1 4 1.00 cmlt v0.8h, v0.8h, #0
-# CHECK-NEXT: 1 4 0.50 cmtst d20, d21, d22
-# CHECK-NEXT: 1 4 0.50 cmtst v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, #0
+# CHECK-NEXT: 1 2 0.50 cmeq d20, d21, d22
+# CHECK-NEXT: 1 2 1.00 cmeq v0.16b, v0.16b, #0
+# CHECK-NEXT: 1 2 1.00 cmeq v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 0.50 cmge d20, d21, #0
+# CHECK-NEXT: 1 2 0.50 cmge d20, d21, d22
+# CHECK-NEXT: 1 2 0.50 cmge v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 cmge v0.8b, v0.8b, #0
+# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, #0
+# CHECK-NEXT: 1 2 0.50 cmgt d20, d21, d22
+# CHECK-NEXT: 1 2 0.50 cmgt v0.2s, v0.2s, #0
+# CHECK-NEXT: 1 2 1.00 cmgt v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 2 0.50 cmhi d20, d21, d22
+# CHECK-NEXT: 1 2 1.00 cmhi v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 0.50 cmhs d20, d21, d22
+# CHECK-NEXT: 1 2 0.50 cmhs v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 0.50 cmle d20, d21, #0
+# CHECK-NEXT: 1 2 1.00 cmle v0.2d, v0.2d, #0
+# CHECK-NEXT: 1 2 0.50 cmlt d20, d21, #0
+# CHECK-NEXT: 1 2 1.00 cmlt v0.8h, v0.8h, #0
+# CHECK-NEXT: 1 3 0.50 cmtst d20, d21, d22
+# CHECK-NEXT: 1 3 0.50 cmtst v0.2s, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 cnt v0.16b, v0.16b
# CHECK-NEXT: 1 4 0.50 cnt v0.8b, v0.8b
# CHECK-NEXT: 1 2 0.50 dup v0.16b, w28
@@ -1137,7 +1137,7 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 2 0.50 dup v0.4s, w28
# CHECK-NEXT: 1 4 0.50 dup v0.8b, w28
# CHECK-NEXT: 1 2 0.50 dup v0.8h, w28
-# CHECK-NEXT: 1 4 1.00 eor v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 1 1.00 eor v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 ext v0.16b, v0.16b, v0.16b, #3
# CHECK-NEXT: 1 4 0.50 ext v0.8b, v0.8b, v0.8b, #3
# CHECK-NEXT: 1 4 0.50 fabd d29, d24, d20
@@ -1429,8 +1429,8 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 mov d6, v0.d[1]
# CHECK-NEXT: 1 4 0.50 mov h2, v0.h[5]
# CHECK-NEXT: 1 4 0.50 mov s17, v0.s[2]
-# CHECK-NEXT: 1 4 1.00 mov v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 mov v0.8b, v0.8b
+# CHECK-NEXT: 1 1 1.00 mov v0.16b, v0.16b
+# CHECK-NEXT: 1 1 0.50 mov v0.8b, v0.8b
# CHECK-NEXT: 1 4 0.50 movi d15, #0xff00ff00ff00ff
# CHECK-NEXT: 1 4 1.00 movi v0.16b, #31
# CHECK-NEXT: 1 4 1.00 movi v0.2d, #0xff0000ff0000ffff
@@ -1438,31 +1438,31 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 movi v0.4s, #255, lsl #24
# CHECK-NEXT: 1 4 0.50 movi v0.8b, #255
# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 mvni v0.2s, #0
-# CHECK-NEXT: 1 4 1.00 mvni v0.4s, #16, msl #16
-# CHECK-NEXT: 1 4 0.50 neg d29, d24
-# CHECK-NEXT: 1 4 1.00 neg v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 neg v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 neg v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 neg v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 neg v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 neg v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 neg v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 mvn v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 mvn v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 orn v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 mov v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 orr v0.8h, #31
-# CHECK-NEXT: 1 4 1.00 pmul v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 pmul v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 pmull v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 pmull2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 raddhn v0.2s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 raddhn v0.4h, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 raddhn v0.8b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 raddhn2 v0.16b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 raddhn2 v0.4s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 raddhn2 v0.8h, v0.4s, v0.4s
+# CHECK-NEXT: 1 1 0.50 mvni v0.2s, #0
+# CHECK-NEXT: 1 1 1.00 mvni v0.4s, #16, msl #16
+# CHECK-NEXT: 1 2 0.50 neg d29, d24
+# CHECK-NEXT: 1 2 1.00 neg v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 neg v0.2d, v0.2d
+# CHECK-NEXT: 1 2 0.50 neg v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 neg v0.4h, v0.4h
+# CHECK-NEXT: 1 2 1.00 neg v0.4s, v0.4s
+# CHECK-NEXT: 1 2 0.50 neg v0.8b, v0.8b
+# CHECK-NEXT: 1 2 1.00 neg v0.8h, v0.8h
+# CHECK-NEXT: 1 1 1.00 mvn v0.16b, v0.16b
+# CHECK-NEXT: 1 1 0.50 mvn v0.8b, v0.8b
+# CHECK-NEXT: 1 1 1.00 orn v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 1 1.00 mov v0.16b, v0.16b
+# CHECK-NEXT: 1 1 1.00 orr v0.8h, #31
+# CHECK-NEXT: 1 3 1.00 pmul v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 pmul v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 pmull v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 pmull2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 4 2.00 raddhn v0.2s, v0.2d, v0.2d
+# CHECK-NEXT: 1 4 2.00 raddhn v0.4h, v0.4s, v0.4s
+# CHECK-NEXT: 1 4 2.00 raddhn v0.8b, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 raddhn2 v0.16b, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 raddhn2 v0.4s, v0.2d, v0.2d
+# CHECK-NEXT: 1 4 2.00 raddhn2 v0.8h, v0.4s, v0.4s
# CHECK-NEXT: 1 4 1.00 rbit v0.16b, v0.16b
# CHECK-NEXT: 1 4 0.50 rbit v0.8b, v0.8b
# CHECK-NEXT: 1 4 0.50 rev16 v21.8b, v1.8b
@@ -1477,56 +1477,56 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 rev64 v2.8h, v4.8h
# CHECK-NEXT: 1 4 0.50 rev64 v4.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 rev64 v6.4s, v8.4s
-# CHECK-NEXT: 1 4 0.50 rshrn v0.2s, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 rshrn v0.4h, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 rshrn v0.8b, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 rshrn2 v0.16b, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 rshrn2 v0.4s, v0.2d, #3
-# CHECK-NEXT: 1 4 1.00 rshrn2 v0.8h, v0.4s, #3
-# CHECK-NEXT: 1 4 1.00 rsubhn v0.2s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 rsubhn v0.4h, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 rsubhn v0.8b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 rsubhn2 v0.16b, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 rsubhn2 v0.4s, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 rsubhn2 v0.8h, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 saba v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 sabal v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 sabal v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 sabal v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 sabal2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 sabal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 sabal2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 sabd v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 sabdl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 sabdl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 sabdl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 sabdl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 sabdl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 sabdl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 sadalp v0.1d, v0.2s
-# CHECK-NEXT: 1 4 1.00 sadalp v0.2d, v0.4s
-# CHECK-NEXT: 1 4 0.50 sadalp v0.2s, v0.4h
-# CHECK-NEXT: 1 4 0.50 sadalp v0.4h, v0.8b
-# CHECK-NEXT: 1 4 1.00 sadalp v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 sadalp v0.8h, v0.16b
-# CHECK-NEXT: 1 4 1.00 saddl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 saddl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 saddl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 saddl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 saddl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 saddl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 saddlp v0.1d, v0.2s
-# CHECK-NEXT: 1 4 1.00 saddlp v0.2d, v0.4s
-# CHECK-NEXT: 1 4 0.50 saddlp v0.2s, v0.4h
-# CHECK-NEXT: 1 4 0.50 saddlp v0.4h, v0.8b
-# CHECK-NEXT: 1 4 1.00 saddlp v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 saddlp v0.8h, v0.16b
-# CHECK-NEXT: 1 4 1.00 saddw v0.2d, v0.2d, v0.2s
-# CHECK-NEXT: 1 4 1.00 saddw v0.4s, v0.4s, v0.4h
-# CHECK-NEXT: 1 4 1.00 saddw v0.8h, v0.8h, v0.8b
-# CHECK-NEXT: 1 4 1.00 saddw2 v0.2d, v0.2d, v0.4s
-# CHECK-NEXT: 1 4 1.00 saddw2 v0.4s, v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 saddw2 v0.8h, v0.8h, v0.16b
+# CHECK-NEXT: 1 3 0.50 rshrn v0.2s, v0.2d, #3
+# CHECK-NEXT: 1 3 0.50 rshrn v0.4h, v0.4s, #3
+# CHECK-NEXT: 1 3 0.50 rshrn v0.8b, v0.8h, #3
+# CHECK-NEXT: 1 3 1.00 rshrn2 v0.16b, v0.8h, #3
+# CHECK-NEXT: 1 3 1.00 rshrn2 v0.4s, v0.2d, #3
+# CHECK-NEXT: 1 3 1.00 rshrn2 v0.8h, v0.4s, #3
+# CHECK-NEXT: 1 4 2.00 rsubhn v0.2s, v0.2d, v0.2d
+# CHECK-NEXT: 1 4 2.00 rsubhn v0.4h, v0.4s, v0.4s
+# CHECK-NEXT: 1 4 2.00 rsubhn v0.8b, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.16b, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.4s, v0.2d, v0.2d
+# CHECK-NEXT: 1 4 2.00 rsubhn2 v0.8h, v0.4s, v0.4s
+# CHECK-NEXT: 1 4 2.00 saba v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 4 2.00 sabal v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 4 2.00 sabal v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 4 2.00 sabal v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 4 2.00 sabal2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 4 2.00 sabal2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 sabal2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 sabdl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 sabdl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 sabdl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 sabdl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 sabdl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 sabdl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 4 2.00 sadalp v0.1d, v0.2s
+# CHECK-NEXT: 1 4 2.00 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 1 4 2.00 sadalp v0.2s, v0.4h
+# CHECK-NEXT: 1 4 2.00 sadalp v0.4h, v0.8b
+# CHECK-NEXT: 1 4 2.00 sadalp v0.4s, v0.8h
+# CHECK-NEXT: 1 4 2.00 sadalp v0.8h, v0.16b
+# CHECK-NEXT: 1 3 1.00 saddl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 saddl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 saddl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 saddl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 saddl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 saddl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s
+# CHECK-NEXT: 1 3 1.00 saddlp v0.2d, v0.4s
+# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h
+# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b
+# CHECK-NEXT: 1 3 1.00 saddlp v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 saddlp v0.8h, v0.16b
+# CHECK-NEXT: 1 3 1.00 saddw v0.2d, v0.2d, v0.2s
+# CHECK-NEXT: 1 3 1.00 saddw v0.4s, v0.4s, v0.4h
+# CHECK-NEXT: 1 3 1.00 saddw v0.8h, v0.8h, v0.8b
+# CHECK-NEXT: 1 3 1.00 saddw2 v0.2d, v0.2d, v0.4s
+# CHECK-NEXT: 1 3 1.00 saddw2 v0.4s, v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 saddw2 v0.8h, v0.8h, v0.16b
# CHECK-NEXT: 1 4 0.50 scvtf d21, d12
# CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64
# CHECK-NEXT: 1 4 0.50 scvtf s22, s13
@@ -1539,33 +1539,33 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s
# CHECK-NEXT: 1 4 0.50 scvtf v0.4s, v0.4s, #3
# CHECK-NEXT: 1 4 0.50 scvtf v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 shadd v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 shl d7, d10, #12
-# CHECK-NEXT: 1 4 1.00 shl v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 shl v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 shl v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 shl v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 1.00 shll v0.2d, v0.2s, #32
-# CHECK-NEXT: 1 4 1.00 shll v0.4s, v0.4h, #16
-# CHECK-NEXT: 1 4 1.00 shll v0.8h, v0.8b, #8
-# CHECK-NEXT: 1 4 1.00 shll v0.2d, v0.2s, #32
-# CHECK-NEXT: 1 4 1.00 shll v0.4s, v0.4h, #16
-# CHECK-NEXT: 1 4 1.00 shll v0.8h, v0.8b, #8
-# CHECK-NEXT: 1 4 1.00 shll2 v0.2d, v0.4s, #32
-# CHECK-NEXT: 1 4 1.00 shll2 v0.4s, v0.8h, #16
-# CHECK-NEXT: 1 4 1.00 shll2 v0.8h, v0.16b, #8
-# CHECK-NEXT: 1 4 1.00 shll2 v0.2d, v0.4s, #32
-# CHECK-NEXT: 1 4 1.00 shll2 v0.4s, v0.8h, #16
-# CHECK-NEXT: 1 4 1.00 shll2 v0.8h, v0.16b, #8
-# CHECK-NEXT: 1 4 0.50 shrn v0.2s, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 shrn v0.4h, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 shrn v0.8b, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 shrn2 v0.16b, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 shrn2 v0.4s, v0.2d, #3
-# CHECK-NEXT: 1 4 1.00 shrn2 v0.8h, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 shsub v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 shsub v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 sli d10, d14, #12
+# CHECK-NEXT: 1 2 0.50 shadd v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 0.50 shl d7, d10, #12
+# CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 2 0.50 shl v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 2 0.50 shl v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 2 0.50 shl v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 2 1.00 shll v0.2d, v0.2s, #32
+# CHECK-NEXT: 1 2 1.00 shll v0.4s, v0.4h, #16
+# CHECK-NEXT: 1 2 1.00 shll v0.8h, v0.8b, #8
+# CHECK-NEXT: 1 2 1.00 shll v0.2d, v0.2s, #32
+# CHECK-NEXT: 1 2 1.00 shll v0.4s, v0.4h, #16
+# CHECK-NEXT: 1 2 1.00 shll v0.8h, v0.8b, #8
+# CHECK-NEXT: 1 2 1.00 shll2 v0.2d, v0.4s, #32
+# CHECK-NEXT: 1 2 1.00 shll2 v0.4s, v0.8h, #16
+# CHECK-NEXT: 1 2 1.00 shll2 v0.8h, v0.16b, #8
+# CHECK-NEXT: 1 2 1.00 shll2 v0.2d, v0.4s, #32
+# CHECK-NEXT: 1 2 1.00 shll2 v0.4s, v0.8h, #16
+# CHECK-NEXT: 1 2 1.00 shll2 v0.8h, v0.16b, #8
+# CHECK-NEXT: 1 2 0.50 shrn v0.2s, v0.2d, #3
+# CHECK-NEXT: 1 2 0.50 shrn v0.4h, v0.4s, #3
+# CHECK-NEXT: 1 2 0.50 shrn v0.8b, v0.8h, #3
+# CHECK-NEXT: 1 2 0.50 shrn2 v0.16b, v0.8h, #3
+# CHECK-NEXT: 1 2 0.50 shrn2 v0.4s, v0.2d, #3
+# CHECK-NEXT: 1 2 0.50 shrn2 v0.8h, v0.4s, #3
+# CHECK-NEXT: 1 2 0.50 shsub v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 shsub v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 sli d10, d14, #12
# CHECK-NEXT: 1 4 1.00 sli v0.16b, v0.16b, #3
# CHECK-NEXT: 1 4 1.00 sli v0.2d, v0.2d, #3
# CHECK-NEXT: 1 4 0.50 sli v0.2s, v0.2s, #3
@@ -1573,18 +1573,18 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 sli v0.4s, v0.4s, #3
# CHECK-NEXT: 1 4 0.50 sli v0.8b, v0.8b, #3
# CHECK-NEXT: 1 4 1.00 sli v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 smax v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 smax v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 smax v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 smaxp v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 smaxp v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 smaxp v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 smin v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 0.50 smax v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 smax v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 smax v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 0.50 smaxp v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 smaxp v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 smaxp v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 1.00 smin v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 smin v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 smin v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 sminp v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 smin v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 1.00 sminp v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 sminp v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 sminp v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 1.00 sminp v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 smlal v0.2d, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 smlal v0.4s, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 smlal v0.8h, v0.8b, v0.8b
@@ -1614,53 +1614,53 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 sqabs v0.4s, v0.4s
# CHECK-NEXT: 1 4 0.50 sqabs v0.8b, v0.8b
# CHECK-NEXT: 1 4 1.00 sqabs v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 sqadd b20, b11, b15
-# CHECK-NEXT: 1 4 1.00 sqadd v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 sqadd v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12
-# CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1]
-# CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3]
-# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12
+# CHECK-NEXT: 1 3 0.50 sqadd b20, b11, b15
+# CHECK-NEXT: 1 3 1.00 sqadd v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 sqadd v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 4 1.00 sqdmlal d19, s24, s12
+# CHECK-NEXT: 1 4 1.00 sqdmlal d8, s9, v0.s[1]
+# CHECK-NEXT: 1 4 1.00 sqdmlal s0, h0, v0.h[3]
+# CHECK-NEXT: 1 4 1.00 sqdmlal s17, h27, h12
# CHECK-NEXT: 1 4 1.00 sqdmlal v0.2d, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 sqdmlal v0.4s, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 sqdmlal2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: 1 4 1.00 sqdmlal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13
-# CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1]
-# CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3]
-# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25
+# CHECK-NEXT: 1 4 1.00 sqdmlsl d12, s23, s13
+# CHECK-NEXT: 1 4 1.00 sqdmlsl d8, s9, v0.s[1]
+# CHECK-NEXT: 1 4 1.00 sqdmlsl s0, h0, v0.h[3]
+# CHECK-NEXT: 1 4 1.00 sqdmlsl s14, h12, h25
# CHECK-NEXT: 1 4 1.00 sqdmlsl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 sqdmlsl v0.4s, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 sqdmlsl2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: 1 4 1.00 sqdmlsl2 v0.4s, v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12
-# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3]
-# CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1]
+# CHECK-NEXT: 1 4 1.00 sqdmulh h7, h15, v0.h[3]
+# CHECK-NEXT: 1 4 1.00 sqdmulh s15, s14, v0.s[1]
# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2
# CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 sqdmulh v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 sqdmull d1, s1, v0.s[1]
-# CHECK-NEXT: 1 4 0.50 sqdmull d15, s22, s12
-# CHECK-NEXT: 1 4 0.50 sqdmull s1, h1, v0.h[3]
-# CHECK-NEXT: 1 4 0.50 sqdmull s12, h22, h12
+# CHECK-NEXT: 1 4 1.00 sqdmull d1, s1, v0.s[1]
+# CHECK-NEXT: 1 4 1.00 sqdmull d15, s22, s12
+# CHECK-NEXT: 1 4 1.00 sqdmull s1, h1, v0.h[3]
+# CHECK-NEXT: 1 4 1.00 sqdmull s12, h22, h12
# CHECK-NEXT: 1 4 1.00 sqdmull v0.2d, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 sqdmull v0.4s, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 sqdmull2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: 1 4 1.00 sqdmull2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 sqneg b19, b14
-# CHECK-NEXT: 1 4 0.50 sqneg d18, d12
-# CHECK-NEXT: 1 4 0.50 sqneg h21, h15
-# CHECK-NEXT: 1 4 0.50 sqneg s20, s12
-# CHECK-NEXT: 1 4 1.00 sqneg v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 sqneg v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 sqneg v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 sqneg v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 sqneg v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 sqneg v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 sqneg v0.8h, v0.8h
+# CHECK-NEXT: 1 3 0.50 sqneg b19, b14
+# CHECK-NEXT: 1 3 0.50 sqneg d18, d12
+# CHECK-NEXT: 1 3 0.50 sqneg h21, h15
+# CHECK-NEXT: 1 3 0.50 sqneg s20, s12
+# CHECK-NEXT: 1 3 1.00 sqneg v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 sqneg v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 sqneg v0.2s, v0.2s
+# CHECK-NEXT: 1 3 0.50 sqneg v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 sqneg v0.4s, v0.4s
+# CHECK-NEXT: 1 3 0.50 sqneg v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 sqneg v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12
-# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3]
-# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1]
+# CHECK-NEXT: 1 4 1.00 sqrdmulh h7, h15, v0.h[3]
+# CHECK-NEXT: 1 4 1.00 sqrdmulh s15, s14, v0.s[1]
# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2
# CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 sqrdmulh v0.8h, v0.8h, v0.8h
@@ -1732,10 +1732,10 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 sqshrun2 v0.16b, v0.8h, #3
# CHECK-NEXT: 1 4 1.00 sqshrun2 v0.4s, v0.2d, #3
# CHECK-NEXT: 1 4 1.00 sqshrun2 v0.8h, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 sqsub s20, s10, s7
-# CHECK-NEXT: 1 4 1.00 sqsub v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 sqsub v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 sqsub v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 0.50 sqsub s20, s10, s7
+# CHECK-NEXT: 1 3 1.00 sqsub v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 sqsub v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 0.50 sqsub v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 4 0.50 sqxtn b18, h18
# CHECK-NEXT: 1 4 0.50 sqxtn h20, s17
# CHECK-NEXT: 1 4 0.50 sqxtn s19, d14
@@ -1754,10 +1754,10 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 sqxtun2 v0.16b, v0.8h
# CHECK-NEXT: 1 4 1.00 sqxtun2 v0.4s, v0.2d
# CHECK-NEXT: 1 4 1.00 sqxtun2 v0.8h, v0.4s
-# CHECK-NEXT: 1 4 0.50 srhadd v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 srhadd v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 srhadd v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 sri d10, d12, #14
+# CHECK-NEXT: 1 2 0.50 srhadd v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 srhadd v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 srhadd v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 0.50 sri d10, d12, #14
# CHECK-NEXT: 1 4 1.00 sri v0.16b, v0.16b, #3
# CHECK-NEXT: 1 4 1.00 sri v0.2d, v0.2d, #3
# CHECK-NEXT: 1 4 0.50 sri v0.2s, v0.2s, #3
@@ -1765,61 +1765,61 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 sri v0.4s, v0.4s, #3
# CHECK-NEXT: 1 4 0.50 sri v0.8b, v0.8b, #3
# CHECK-NEXT: 1 4 1.00 sri v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 srshl d16, d16, d16
-# CHECK-NEXT: 1 4 0.50 srshl v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 srshl v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 srshl v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 srshr d19, d18, #7
-# CHECK-NEXT: 1 4 1.00 srshr v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 srshr v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 srshr v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 srshr v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 srshr v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 srshr v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 srshr v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 srsra d15, d11, #19
-# CHECK-NEXT: 1 4 1.00 srsra v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 srsra v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 srsra v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 srsra v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 srsra v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 srsra v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 srsra v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 sshl d31, d31, d31
-# CHECK-NEXT: 1 4 1.00 sshl v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 sshl v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 sshl v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 sshl v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 sshll v0.2d, v0.2s, #3
-# CHECK-NEXT: 1 4 1.00 sshll2 v0.4s, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 sshr d15, d16, #12
-# CHECK-NEXT: 1 4 1.00 sshr v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 sshr v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 sshr v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 sshr v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 sshr v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 sshr v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 sshr v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 ssra d18, d12, #21
-# CHECK-NEXT: 1 4 1.00 ssra v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 ssra v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 ssra v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 ssra v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 ssra v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 ssra v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 ssra v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 ssubl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 ssubl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 ssubl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 ssubl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 ssubl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 ssubl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 ssubw v0.2d, v0.2d, v0.2s
-# CHECK-NEXT: 1 4 1.00 ssubw v0.4s, v0.4s, v0.4h
-# CHECK-NEXT: 1 4 1.00 ssubw v0.8h, v0.8h, v0.8b
-# CHECK-NEXT: 1 4 1.00 ssubw2 v0.2d, v0.2d, v0.4s
-# CHECK-NEXT: 1 4 1.00 ssubw2 v0.4s, v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 ssubw2 v0.8h, v0.8h, v0.16b
+# CHECK-NEXT: 1 3 0.50 srshl d16, d16, d16
+# CHECK-NEXT: 1 3 0.50 srshl v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 0.50 srshl v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 0.50 srshl v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 0.50 srshr d19, d18, #7
+# CHECK-NEXT: 1 3 1.00 srshr v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 3 1.00 srshr v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 0.50 srshr v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 0.50 srshr v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 3 1.00 srshr v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 3 1.00 srshr v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 4 2.00 srsra d15, d11, #19
+# CHECK-NEXT: 1 4 2.00 srsra v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 4 2.00 srsra v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 2 0.50 sshl d31, d31, d31
+# CHECK-NEXT: 1 2 1.00 sshl v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1 2 0.50 sshl v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 sshl v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 sshl v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 1.00 sshll v0.2d, v0.2s, #3
+# CHECK-NEXT: 1 2 1.00 sshll2 v0.4s, v0.8h, #3
+# CHECK-NEXT: 1 2 0.50 sshr d15, d16, #12
+# CHECK-NEXT: 1 2 0.50 sshr v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 2 0.50 sshr v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21
+# CHECK-NEXT: 1 3 1.00 ssra v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 3 1.00 ssra v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 3 1.00 ssra v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 3 1.00 ssra v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 3 1.00 ssubl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 ssubl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 ssubl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 ssubl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 ssubl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 ssubl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 ssubw v0.2d, v0.2d, v0.2s
+# CHECK-NEXT: 1 3 1.00 ssubw v0.4s, v0.4s, v0.4h
+# CHECK-NEXT: 1 3 1.00 ssubw v0.8h, v0.8h, v0.8b
+# CHECK-NEXT: 1 3 1.00 ssubw2 v0.2d, v0.2d, v0.4s
+# CHECK-NEXT: 1 3 1.00 ssubw2 v0.4s, v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 ssubw2 v0.8h, v0.8h, v0.16b
# CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, [x0]
# CHECK-NEXT: 2 5 2.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48
# CHECK-NEXT: 1 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
@@ -1842,19 +1842,19 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 2 5 4.00 * st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], #64
# CHECK-NEXT: 1 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0]
# CHECK-NEXT: 2 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5
-# CHECK-NEXT: 1 4 0.50 sub d15, d5, d16
-# CHECK-NEXT: 1 4 1.00 sub v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 suqadd b19, b14
-# CHECK-NEXT: 1 4 0.50 suqadd d18, d22
-# CHECK-NEXT: 1 4 0.50 suqadd h20, h15
-# CHECK-NEXT: 1 4 0.50 suqadd s21, s12
-# CHECK-NEXT: 1 4 1.00 suqadd v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 suqadd v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 suqadd v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 suqadd v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 suqadd v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 suqadd v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 suqadd v0.8h, v0.8h
+# CHECK-NEXT: 1 2 0.50 sub d15, d5, d16
+# CHECK-NEXT: 1 2 1.00 sub v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 suqadd b19, b14
+# CHECK-NEXT: 1 3 0.50 suqadd d18, d22
+# CHECK-NEXT: 1 3 0.50 suqadd h20, h15
+# CHECK-NEXT: 1 3 0.50 suqadd s21, s12
+# CHECK-NEXT: 1 3 1.00 suqadd v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 suqadd v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 suqadd v0.2s, v0.2s
+# CHECK-NEXT: 1 3 0.50 suqadd v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 suqadd v0.4s, v0.4s
+# CHECK-NEXT: 1 3 0.50 suqadd v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 suqadd v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b }, v0.16b
# CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b, v1.16b }, v0.16b
# CHECK-NEXT: 1 4 1.00 tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v0.16b
@@ -1885,44 +1885,44 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 trn2 v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1 4 0.50 trn2 v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 4 1.00 trn2 v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 uaba v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 uabal v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 uabal v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 uabal v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 uabal2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 uabal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 uabal2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 uabd v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 uabdl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 uabdl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 uabdl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 uabdl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 uabdl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 uabdl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 uadalp v0.1d, v0.2s
-# CHECK-NEXT: 1 4 1.00 uadalp v0.2d, v0.4s
-# CHECK-NEXT: 1 4 0.50 uadalp v0.2s, v0.4h
-# CHECK-NEXT: 1 4 0.50 uadalp v0.4h, v0.8b
-# CHECK-NEXT: 1 4 1.00 uadalp v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 uadalp v0.8h, v0.16b
-# CHECK-NEXT: 1 4 1.00 uaddl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 uaddl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 uaddl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 uaddl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 uaddl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 uaddl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 uaddlp v0.1d, v0.2s
-# CHECK-NEXT: 1 4 1.00 uaddlp v0.2d, v0.4s
-# CHECK-NEXT: 1 4 0.50 uaddlp v0.2s, v0.4h
-# CHECK-NEXT: 1 4 0.50 uaddlp v0.4h, v0.8b
-# CHECK-NEXT: 1 4 1.00 uaddlp v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 uaddlp v0.8h, v0.16b
-# CHECK-NEXT: 1 4 1.00 uaddw v0.2d, v0.2d, v0.2s
-# CHECK-NEXT: 1 4 1.00 uaddw v0.4s, v0.4s, v0.4h
-# CHECK-NEXT: 1 4 1.00 uaddw v0.8h, v0.8h, v0.8b
-# CHECK-NEXT: 1 4 1.00 uaddw2 v0.2d, v0.2d, v0.4s
-# CHECK-NEXT: 1 4 1.00 uaddw2 v0.4s, v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 uaddw2 v0.8h, v0.8h, v0.16b
+# CHECK-NEXT: 1 4 2.00 uaba v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 4 2.00 uabal v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 4 2.00 uabal v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 4 2.00 uabal v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 4 2.00 uabal2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 4 2.00 uabal2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 4 2.00 uabal2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 uabdl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 uabdl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 uabdl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 uabdl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 uabdl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 uabdl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 4 2.00 uadalp v0.1d, v0.2s
+# CHECK-NEXT: 1 4 2.00 uadalp v0.2d, v0.4s
+# CHECK-NEXT: 1 4 2.00 uadalp v0.2s, v0.4h
+# CHECK-NEXT: 1 4 2.00 uadalp v0.4h, v0.8b
+# CHECK-NEXT: 1 4 2.00 uadalp v0.4s, v0.8h
+# CHECK-NEXT: 1 4 2.00 uadalp v0.8h, v0.16b
+# CHECK-NEXT: 1 3 1.00 uaddl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 uaddl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 uaddl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 uaddl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 uaddl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 uaddl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s
+# CHECK-NEXT: 1 3 1.00 uaddlp v0.2d, v0.4s
+# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h
+# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b
+# CHECK-NEXT: 1 3 1.00 uaddlp v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 uaddlp v0.8h, v0.16b
+# CHECK-NEXT: 1 3 1.00 uaddw v0.2d, v0.2d, v0.2s
+# CHECK-NEXT: 1 3 1.00 uaddw v0.4s, v0.4s, v0.4h
+# CHECK-NEXT: 1 3 1.00 uaddw v0.8h, v0.8h, v0.8b
+# CHECK-NEXT: 1 3 1.00 uaddw2 v0.2d, v0.2d, v0.4s
+# CHECK-NEXT: 1 3 1.00 uaddw2 v0.4s, v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 uaddw2 v0.8h, v0.8h, v0.16b
# CHECK-NEXT: 1 4 0.50 ucvtf d21, d14
# CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64
# CHECK-NEXT: 1 4 0.50 ucvtf s22, s13
@@ -1935,21 +1935,21 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s
# CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3
# CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 uhadd v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 uhadd v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 uhsub v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 umax v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 uhadd v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 uhadd v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 1.00 uhsub v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 2 1.00 umax v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 umax v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 umax v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 umaxp v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 umax v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 1.00 umaxp v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 umaxp v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 umaxp v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 umin v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 umin v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 umin v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 0.50 uminp v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 uminp v0.4h, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 uminp v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 1.00 umaxp v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 0.50 umin v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 umin v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 umin v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: 1 2 0.50 uminp v0.2s, v0.2s, v0.2s
+# CHECK-NEXT: 1 2 0.50 uminp v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 2 0.50 uminp v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 4 1.00 umlal v0.2d, v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 umlal v0.4s, v0.4h, v0.4h
# CHECK-NEXT: 1 4 1.00 umlal v0.8h, v0.8b, v0.8b
@@ -1968,8 +1968,8 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 umull2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: 1 4 1.00 umull2 v0.4s, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 umull2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 0.50 uqadd h0, h1, h5
-# CHECK-NEXT: 1 4 1.00 uqadd v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 0.50 uqadd h0, h1, h5
+# CHECK-NEXT: 1 3 1.00 uqadd v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 uqrshl b11, b20, b30
# CHECK-NEXT: 1 4 0.50 uqrshl s23, s20, s16
# CHECK-NEXT: 1 4 1.00 uqrshl v0.16b, v0.16b, v0.16b
@@ -2011,8 +2011,8 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 uqshrn2 v0.16b, v0.8h, #3
# CHECK-NEXT: 1 4 1.00 uqshrn2 v0.4s, v0.2d, #3
# CHECK-NEXT: 1 4 1.00 uqshrn2 v0.8h, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 uqsub d16, d16, d16
-# CHECK-NEXT: 1 4 0.50 uqsub v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 0.50 uqsub d16, d16, d16
+# CHECK-NEXT: 1 3 0.50 uqsub v0.4h, v0.4h, v0.4h
# CHECK-NEXT: 1 4 0.50 uqxtn b18, h18
# CHECK-NEXT: 1 4 0.50 uqxtn h20, s17
# CHECK-NEXT: 1 4 0.50 uqxtn s19, d14
@@ -2024,77 +2024,77 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 4 1.00 uqxtn2 v0.8h, v0.4s
# CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s
# CHECK-NEXT: 1 4 1.00 urecpe v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 urhadd v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 urhadd v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 urhadd v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 urshl d8, d7, d4
-# CHECK-NEXT: 1 4 1.00 urshl v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 urshl v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1 4 1.00 urshl v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 urshl v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 urshr d20, d23, #31
-# CHECK-NEXT: 1 4 1.00 urshr v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 urshr v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 urshr v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 urshr v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 urshr v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 urshr v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 urshr v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 2 1.00 urhadd v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 urhadd v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 2 1.00 urhadd v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4
+# CHECK-NEXT: 1 3 1.00 urshl v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 urshl v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 urshl v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 urshl v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 0.50 urshr d20, d23, #31
+# CHECK-NEXT: 1 3 1.00 urshr v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 3 1.00 urshr v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 0.50 urshr v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 0.50 urshr v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 3 1.00 urshr v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 3 0.50 urshr v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 3 1.00 urshr v0.8h, v0.8h, #3
# CHECK-NEXT: 1 12 9.00 ursqrte v0.2s, v0.2s
# CHECK-NEXT: 1 12 9.00 ursqrte v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 ursra d18, d10, #13
-# CHECK-NEXT: 1 4 1.00 ursra v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 ursra v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 ursra v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 ursra v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 ursra v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 ursra v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 ushl d0, d0, d0
-# CHECK-NEXT: 1 4 1.00 ushl v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 ushl v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 ushl v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 ushll v0.4s, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 ushll2 v0.8h, v0.16b, #3
-# CHECK-NEXT: 1 4 0.50 ushr d10, d17, #18
-# CHECK-NEXT: 1 4 1.00 ushr v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 ushr v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 ushr v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 ushr v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 ushr v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 ushr v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 ushr v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 0.50 usqadd b19, b14
-# CHECK-NEXT: 1 4 0.50 usqadd d18, d22
-# CHECK-NEXT: 1 4 0.50 usqadd h20, h15
-# CHECK-NEXT: 1 4 0.50 usqadd s21, s12
-# CHECK-NEXT: 1 4 1.00 usqadd v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 usqadd v0.2d, v0.2d
-# CHECK-NEXT: 1 4 0.50 usqadd v0.2s, v0.2s
-# CHECK-NEXT: 1 4 0.50 usqadd v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 usqadd v0.4s, v0.4s
-# CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 usqadd v0.8h, v0.8h
-# CHECK-NEXT: 1 4 0.50 usra d20, d13, #61
-# CHECK-NEXT: 1 4 1.00 usra v0.16b, v0.16b, #3
-# CHECK-NEXT: 1 4 1.00 usra v0.2d, v0.2d, #3
-# CHECK-NEXT: 1 4 0.50 usra v0.2s, v0.2s, #3
-# CHECK-NEXT: 1 4 0.50 usra v0.4h, v0.4h, #3
-# CHECK-NEXT: 1 4 1.00 usra v0.4s, v0.4s, #3
-# CHECK-NEXT: 1 4 0.50 usra v0.8b, v0.8b, #3
-# CHECK-NEXT: 1 4 1.00 usra v0.8h, v0.8h, #3
-# CHECK-NEXT: 1 4 1.00 usubl v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: 1 4 1.00 usubl v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: 1 4 1.00 usubl v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: 1 4 1.00 usubl2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: 1 4 1.00 usubl2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: 1 4 1.00 usubl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: 1 4 1.00 usubw v0.2d, v0.2d, v0.2s
-# CHECK-NEXT: 1 4 1.00 usubw v0.4s, v0.4s, v0.4h
-# CHECK-NEXT: 1 4 1.00 usubw v0.8h, v0.8h, v0.8b
-# CHECK-NEXT: 1 4 1.00 usubw2 v0.2d, v0.2d, v0.4s
-# CHECK-NEXT: 1 4 1.00 usubw2 v0.4s, v0.4s, v0.8h
-# CHECK-NEXT: 1 4 1.00 usubw2 v0.8h, v0.8h, v0.16b
+# CHECK-NEXT: 1 4 2.00 ursra d18, d10, #13
+# CHECK-NEXT: 1 4 2.00 ursra v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 4 2.00 ursra v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0
+# CHECK-NEXT: 1 2 1.00 ushl v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 1 2 1.00 ushl v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1 2 1.00 ushl v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 1 2 1.00 ushll v0.4s, v0.4h, #3
+# CHECK-NEXT: 1 2 1.00 ushll2 v0.8h, v0.16b, #3
+# CHECK-NEXT: 1 2 0.50 ushr d10, d17, #18
+# CHECK-NEXT: 1 2 0.50 ushr v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 2 0.50 ushr v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 3 0.50 usqadd b19, b14
+# CHECK-NEXT: 1 3 0.50 usqadd d18, d22
+# CHECK-NEXT: 1 3 0.50 usqadd h20, h15
+# CHECK-NEXT: 1 3 0.50 usqadd s21, s12
+# CHECK-NEXT: 1 3 1.00 usqadd v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 usqadd v0.2d, v0.2d
+# CHECK-NEXT: 1 3 0.50 usqadd v0.2s, v0.2s
+# CHECK-NEXT: 1 3 0.50 usqadd v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 usqadd v0.4s, v0.4s
+# CHECK-NEXT: 1 3 0.50 usqadd v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 usqadd v0.8h, v0.8h
+# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61
+# CHECK-NEXT: 1 3 1.00 usra v0.16b, v0.16b, #3
+# CHECK-NEXT: 1 3 1.00 usra v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 0.50 usra v0.4h, v0.4h, #3
+# CHECK-NEXT: 1 3 1.00 usra v0.4s, v0.4s, #3
+# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3
+# CHECK-NEXT: 1 3 1.00 usra v0.8h, v0.8h, #3
+# CHECK-NEXT: 1 3 1.00 usubl v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: 1 3 1.00 usubl v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: 1 3 1.00 usubl v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: 1 3 1.00 usubl2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: 1 3 1.00 usubl2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: 1 3 1.00 usubl2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: 1 3 1.00 usubw v0.2d, v0.2d, v0.2s
+# CHECK-NEXT: 1 3 1.00 usubw v0.4s, v0.4s, v0.4h
+# CHECK-NEXT: 1 3 1.00 usubw v0.8h, v0.8h, v0.8b
+# CHECK-NEXT: 1 3 1.00 usubw2 v0.2d, v0.2d, v0.4s
+# CHECK-NEXT: 1 3 1.00 usubw2 v0.4s, v0.4s, v0.8h
+# CHECK-NEXT: 1 3 1.00 usubw2 v0.8h, v0.8h, v0.16b
# CHECK-NEXT: 1 4 1.00 uzp1 v0.16b, v0.16b, v0.16b
# CHECK-NEXT: 1 4 1.00 uzp1 v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1 4 0.50 uzp1 v0.2s, v0.2s, v0.2s
@@ -2146,7 +2146,7 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8]
-# CHECK-NEXT: - - - - 716.50 716.50 197.00 3.00 3.00 107.00 - 52.00
+# CHECK-NEXT: - - - - 780.00 780.00 197.00 3.00 3.00 107.00 - 52.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3.0] [3.1] [4] [5.0] [5.1] [6] [7] [8] Instructions:
@@ -2537,12 +2537,12 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - pmul v0.8b, v0.8b, v0.8b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - pmull v0.8h, v0.8b, v0.8b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - pmull2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.2s, v0.2d, v0.2d
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.4h, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn v0.8b, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn2 v0.16b, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn2 v0.4s, v0.2d, v0.2d
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - raddhn2 v0.8h, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.2s, v0.2d, v0.2d
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.4h, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn v0.8b, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.16b, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.4s, v0.2d, v0.2d
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - raddhn2 v0.8h, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rbit v0.16b, v0.16b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - rbit v0.8b, v0.8b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - rev16 v21.8b, v1.8b
@@ -2563,19 +2563,19 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.16b, v0.8h, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.4s, v0.2d, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rshrn2 v0.8h, v0.4s, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.2s, v0.2d, v0.2d
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.4h, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn v0.8b, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.16b, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.4s, v0.2d, v0.2d
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - rsubhn2 v0.8h, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saba v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabal2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn v0.2s, v0.2d, v0.2d
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn v0.4h, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn v0.8b, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.16b, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.4s, v0.2d, v0.2d
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - rsubhn2 v0.8h, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - saba v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sabal2 v0.8h, v0.16b, v0.16b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sabd v0.4h, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl v0.4s, v0.4h, v0.4h
@@ -2583,12 +2583,12 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.4s, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sabdl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.1d, v0.2s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.2d, v0.4s
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.2s, v0.4h
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sadalp v0.4h, v0.8b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.4s, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sadalp v0.8h, v0.16b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.1d, v0.2s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.2d, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.2s, v0.4h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.4h, v0.8b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.4s, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - sadalp v0.8h, v0.16b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.4s, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - saddl v0.8h, v0.8b, v0.8b
@@ -2621,10 +2621,10 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - scvtf v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shadd v0.8b, v0.8b, v0.8b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl d7, d10, #12
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.16b, v0.16b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.2d, v0.2d, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.16b, v0.16b, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.2d, v0.2d, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.4h, v0.4h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shl v0.4s, v0.4s, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shl v0.4s, v0.4s, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.2d, v0.2s, #32
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.4s, v0.4h, #16
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shll v0.8h, v0.8b, #8
@@ -2640,9 +2640,9 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.2s, v0.2d, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.4h, v0.4s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn v0.8b, v0.8h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.16b, v0.8h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.4s, v0.2d, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - shrn2 v0.8h, v0.4s, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.16b, v0.8h, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.4s, v0.2d, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shrn2 v0.8h, v0.4s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shsub v0.2s, v0.2s, v0.2s
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - shsub v0.4h, v0.4h, v0.4h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sli d10, d14, #12
@@ -2697,32 +2697,32 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqadd b20, b11, b15
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqadd v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqadd v0.2s, v0.2s, v0.2s
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal d19, s24, s12
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal d8, s9, v0.s[1]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal s0, h0, v0.h[3]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlal s17, h27, h12
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal d19, s24, s12
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal d8, s9, v0.s[1]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal s0, h0, v0.h[3]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal s17, h27, h12
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal v0.4s, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl d12, s23, s13
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl d8, s9, v0.s[1]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl s0, h0, v0.h[3]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmlsl s14, h12, h25
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl d12, s23, s13
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl d8, s9, v0.s[1]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl s0, h0, v0.h[3]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl s14, h12, h25
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl v0.4s, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmlsl2 v0.4s, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh h10, h11, h12
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh h7, h15, v0.h[3]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh s15, s14, v0.s[1]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh h7, h15, v0.h[3]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh s15, s14, v0.s[1]
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh s20, s21, s2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmulh v0.2s, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmulh v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull d1, s1, v0.s[1]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull d15, s22, s12
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull s1, h1, v0.h[3]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqdmull s12, h22, h12
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull d1, s1, v0.s[1]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull d15, s22, s12
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull s1, h1, v0.h[3]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull s12, h22, h12
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull v0.4s, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqdmull2 v0.2d, v0.4s, v0.4s
@@ -2739,8 +2739,8 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqneg v0.8b, v0.8b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqneg v0.8h, v0.8h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh h10, h11, h12
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh h7, h15, v0.h[3]
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh s15, s14, v0.s[1]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqrdmulh h7, h15, v0.h[3]
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqrdmulh s15, s14, v0.s[1]
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh s20, s21, s2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sqrdmulh v0.4h, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sqrdmulh v0.8h, v0.8h, v0.8h
@@ -2857,29 +2857,29 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srshr v0.4s, v0.4s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srshr v0.8b, v0.8b, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srshr v0.8h, v0.8h, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra d15, d11, #19
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.16b, v0.16b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.2d, v0.2d, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.2s, v0.2s, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.4h, v0.4h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.4s, v0.4s, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - srsra v0.8b, v0.8b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - srsra v0.8h, v0.8h, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra d15, d11, #19
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.16b, v0.16b, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.2d, v0.2d, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.2s, v0.2s, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.4h, v0.4h, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.4s, v0.4s, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.8b, v0.8b, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - srsra v0.8h, v0.8h, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl d31, d31, d31
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshl v0.2d, v0.2d, v0.2d
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.2s, v0.2s, v0.2s
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.4h, v0.4h, v0.4h
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshl v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshll v0.2d, v0.2s, #3
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshll v0.2d, v0.2s, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshll2 v0.4s, v0.8h, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr d15, d16, #12
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.16b, v0.16b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.2d, v0.2d, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.16b, v0.16b, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.2d, v0.2d, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.2s, v0.2s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.4h, v0.4h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.4s, v0.4s, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.4s, v0.4s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.8b, v0.8b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - sshr v0.8h, v0.8h, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - sshr v0.8h, v0.8h, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ssra d18, d12, #21
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ssra v0.16b, v0.16b, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ssra v0.2d, v0.2d, #3
@@ -2965,13 +2965,13 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - trn2 v0.4s, v0.4s, v0.4s
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - trn2 v0.8b, v0.8b, v0.8b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - trn2 v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uaba v0.8b, v0.8b, v0.8b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.2d, v0.2s, v0.2s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.4s, v0.4h, v0.4h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal v0.8h, v0.8b, v0.8b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.2d, v0.4s, v0.4s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.4s, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabal2 v0.8h, v0.16b, v0.16b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uaba v0.8b, v0.8b, v0.8b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.2d, v0.2s, v0.2s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.4s, v0.4h, v0.4h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal v0.8h, v0.8b, v0.8b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 v0.2d, v0.4s, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 v0.4s, v0.8h, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uabal2 v0.8h, v0.16b, v0.16b
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uabd v0.4h, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl v0.4s, v0.4h, v0.4h
@@ -2979,12 +2979,12 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.2d, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.4s, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uabdl2 v0.8h, v0.16b, v0.16b
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.1d, v0.2s
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.2d, v0.4s
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.2s, v0.4h
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - uadalp v0.4h, v0.8b
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.4s, v0.8h
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uadalp v0.8h, v0.16b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.1d, v0.2s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.2d, v0.4s
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.2s, v0.4h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.4h, v0.8b
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.4s, v0.8h
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - uadalp v0.8h, v0.16b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.2d, v0.2s, v0.2s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.4s, v0.4h, v0.4h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - uaddl v0.8h, v0.8b, v0.8b
@@ -3122,28 +3122,28 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - urshr v0.8h, v0.8h, #3
# CHECK-NEXT: - - - - - - 9.00 - - - - - ursqrte v0.2s, v0.2s
# CHECK-NEXT: - - - - - - 9.00 - - - - - ursqrte v0.4s, v0.4s
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra d18, d10, #13
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.16b, v0.16b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.2d, v0.2d, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.2s, v0.2s, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.4h, v0.4h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.4s, v0.4s, #3
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ursra v0.8b, v0.8b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ursra v0.8h, v0.8h, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra d18, d10, #13
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.16b, v0.16b, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.2d, v0.2d, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.2s, v0.2s, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.4h, v0.4h, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.4s, v0.4s, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.8b, v0.8b, #3
+# CHECK-NEXT: - - - - 2.00 2.00 - - - - - - ursra v0.8h, v0.8h, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushl d0, d0, d0
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.4s, v0.4s, v0.4s
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushl v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushll v0.4s, v0.4h, #3
+# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushll v0.4s, v0.4h, #3
# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushll2 v0.8h, v0.16b, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr d10, d17, #18
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.16b, v0.16b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.2d, v0.2d, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.16b, v0.16b, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.2d, v0.2d, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.2s, v0.2s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.4h, v0.4h, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.4s, v0.4s, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.4s, v0.4s, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.8b, v0.8b, #3
-# CHECK-NEXT: - - - - 1.00 1.00 - - - - - - ushr v0.8h, v0.8h, #3
+# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - ushr v0.8h, v0.8h, #3
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd b19, b14
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd d18, d22
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - usqadd h20, h15
More information about the llvm-commits
mailing list