[llvm] 4621e17 - [DAGCombiner] Relax condition for extract_vector_elt combine (#157658)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 00:51:56 PDT 2025
Author: ZhaoQi
Date: 2025-09-10T15:51:52+08:00
New Revision: 4621e17dee138381bd11c1b8dfe656ab8238cc1a
URL: https://github.com/llvm/llvm-project/commit/4621e17dee138381bd11c1b8dfe656ab8238cc1a
DIFF: https://github.com/llvm/llvm-project/commit/4621e17dee138381bd11c1b8dfe656ab8238cc1a.diff
LOG: [DAGCombiner] Relax condition for extract_vector_elt combine (#157658)
Checking `isOperationLegalOrCustom` instead of `isOperationLegal` allows
more optimization opportunities. In particular, if a target wants to
mark `extract_vector_elt` as `Custom` rather than `Legal` in order to
optimize certain cases, this combiner would otherwise miss some
improvements.
Previously, using `isOperationLegalOrCustom` was avoided due to the risk
of getting stuck in infinite loops (as noted in
https://github.com/llvm/llvm-project/commit/61ec738b60a4fb47ec9b7195de55f1ecb5cbdb45).
After testing, the issue no longer reproduces, but the coverage is
limited to the regression/unit tests and the test-suite.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/shufflevector.ll
llvm/test/CodeGen/Thumb2/active_lane_mask.ll
llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
llvm/test/CodeGen/Thumb2/mve-vabdus.ll
llvm/test/CodeGen/Thumb2/mve-vld2.ll
llvm/test/CodeGen/Thumb2/mve-vld3.ll
llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
llvm/test/CodeGen/Thumb2/mve-vld4.ll
llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
llvm/test/CodeGen/Thumb2/mve-vst3.ll
llvm/test/CodeGen/WebAssembly/vector-reduce.ll
llvm/test/CodeGen/X86/avx512fp16-mov.ll
llvm/test/CodeGen/X86/test-shrink-bug.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/vec_umulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d130efe96b56b..97a3d36a67103 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23933,8 +23933,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// scalar_to_vector here as well.
if (!LegalOperations ||
- // FIXME: Should really be just isOperationLegalOrCustom.
- TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
DAG.getVectorIdxConstant(OrigElt, DL));
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 9fd5e65086782..b47c077ccf1c5 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -286,10 +286,11 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-SD-NEXT: mov s1, v0.s[1]
-; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: str h1, [sp, #14]
+; CHECK-SD-NEXT: mov s0, v0.s[1]
+; CHECK-SD-NEXT: str h0, [sp, #12]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
@@ -491,10 +492,8 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: dup v1.2s, v0.s[0]
+; CHECK-SD-NEXT: str h0, [sp, #14]
; CHECK-SD-NEXT: str h0, [sp, #12]
-; CHECK-SD-NEXT: mov s1, v1.s[1]
-; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index bcd92f81911b2..cae8d6e3deaeb 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -107,6 +107,7 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: ldr r2, [sp, #48]
+; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vqadd.u32 q0, q0, r1
; CHECK-NEXT: ldr r1, [sp, #52]
; CHECK-NEXT: vcmp.u32 hi, q3, q0
@@ -119,12 +120,9 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
; CHECK-NEXT: ldr r1, [sp, #24]
; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: strd r3, r2, [r0, #16]
-; CHECK-NEXT: str r1, [r0, #24]
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: stm r0!, {r1, r2, r3}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
index 37f6bbeffd027..de508e67a7a77 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
@@ -31,24 +31,19 @@ entry:
define arm_aapcs_vfpcc <4 x i16> @complex_add_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: complex_add_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vrev64.32 q3, q1
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: subs r0, r1, r0
-; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vmov r12, r1, d1
+; CHECK-NEXT: vmov r2, lr, d3
+; CHECK-NEXT: vmov r3, r4, d2
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: vmov r2, r0, d0
+; CHECK-NEXT: subs r0, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT: add.w r0, lr, r12
+; CHECK-NEXT: adds r1, r4, r2
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: pop {r4, pc}
entry:
%a.real = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%a.imag = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
index 794894def9265..e11b3c773adf6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
@@ -31,24 +31,19 @@ entry:
define arm_aapcs_vfpcc <4 x i8> @complex_add_v4i8(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: complex_add_v4i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q2, q0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vrev64.32 q3, q1
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: subs r0, r1, r0
-; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vmov r12, r1, d1
+; CHECK-NEXT: vmov r2, lr, d3
+; CHECK-NEXT: vmov r3, r4, d2
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: vmov r2, r0, d0
+; CHECK-NEXT: subs r0, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
+; CHECK-NEXT: add.w r0, lr, r12
+; CHECK-NEXT: adds r1, r4, r2
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+; CHECK-NEXT: pop {r4, pc}
entry:
%a.real = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 0, i32 2>
%a.imag = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 77548b49d77f2..d535c64289d4f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -185,11 +185,10 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f32_v6i32(<6 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov.f32 s6, s5
-; CHECK-MVEFP-NEXT: vmov r2, s4
-; CHECK-MVEFP-NEXT: vmov r1, s6
-; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT: vmov r1, r2, d2
+; CHECK-MVEFP-NEXT: str r2, [r0, #20]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f)
ret <6 x i32> %x
@@ -221,13 +220,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f32_v7i32(<7 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov.f32 s10, s5
-; CHECK-MVEFP-NEXT: vmov r2, s4
; CHECK-MVEFP-NEXT: vmov r3, s6
-; CHECK-MVEFP-NEXT: vmov r1, s10
-; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
-; CHECK-MVEFP-NEXT: str r3, [r0, #24]
+; CHECK-MVEFP-NEXT: vmov r1, r2, d2
+; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f)
ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index ee040feca4240..61f05347d511d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -172,11 +172,10 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f32_v6i32(<6 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov.f32 s6, s5
-; CHECK-MVEFP-NEXT: vmov r2, s4
-; CHECK-MVEFP-NEXT: vmov r1, s6
-; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT: vmov r1, r2, d2
+; CHECK-MVEFP-NEXT: str r2, [r0, #20]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f)
ret <6 x i32> %x
@@ -208,13 +207,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f32_v7i32(<7 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov.f32 s10, s5
-; CHECK-MVEFP-NEXT: vmov r2, s4
; CHECK-MVEFP-NEXT: vmov r3, s6
-; CHECK-MVEFP-NEXT: vmov r1, s10
-; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
-; CHECK-MVEFP-NEXT: str r3, [r0, #24]
+; CHECK-MVEFP-NEXT: vmov r1, r2, d2
+; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
+; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f)
ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index 7be08b04c5957..0f71653afa408 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -4,54 +4,45 @@
define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) {
; CHECK-LABEL: loads_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: vldrw.u32 q2, [r1]
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vmov.f32 s0, s10
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov.f32 s10, s9
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov.i64 q2, #0xffffffff
+; CHECK-NEXT: vmov.f32 s0, s12
+; CHECK-NEXT: vmov.f32 s2, s13
+; CHECK-NEXT: vmov lr, r0, d2
+; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: vmov r1, r5, d1
+; CHECK-NEXT: vmov.f32 s12, s14
+; CHECK-NEXT: vmov.f32 s14, s15
+; CHECK-NEXT: vand q2, q3, q2
+; CHECK-NEXT: vmov r4, r3, d5
+; CHECK-NEXT: asrs r6, r0, #31
+; CHECK-NEXT: adds.w r12, r0, r1
+; CHECK-NEXT: adc.w r1, r6, r5
+; CHECK-NEXT: vmov r6, r5, d3
; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmov lr, r12, d5
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov.f32 s12, s6
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: adds r0, r0, r1
-; CHECK-NEXT: adc.w r1, r2, r3
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: asrl r0, r1, r2
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: adds r2, r1, r4
-; CHECK-NEXT: asr.w r3, r1, #31
-; CHECK-NEXT: adc.w r1, r3, r5
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: asrl r2, r1, r3
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: adds.w r6, r1, lr
-; CHECK-NEXT: asr.w r3, r1, #31
-; CHECK-NEXT: adc.w r1, r3, r12
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: asrl r6, r1, r3
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: adds r4, r4, r1
-; CHECK-NEXT: asr.w r3, r1, #31
-; CHECK-NEXT: adc.w r1, r3, r5
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: asrl r4, r1, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: vmov r2, r8, d3
+; CHECK-NEXT: adds r0, r5, r4
+; CHECK-NEXT: asr.w r4, r5, #31
+; CHECK-NEXT: adc.w r5, r4, r3
+; CHECK-NEXT: vmov r4, r7, d4
+; CHECK-NEXT: asrs r3, r6, #31
+; CHECK-NEXT: asrl r0, r5, r8
+; CHECK-NEXT: adds r4, r4, r6
+; CHECK-NEXT: adcs r3, r7
+; CHECK-NEXT: asrl r4, r3, r2
+; CHECK-NEXT: asr.w r2, lr, #31
+; CHECK-NEXT: vmov r3, r7, d0
+; CHECK-NEXT: adds.w r6, lr, r3
+; CHECK-NEXT: adc.w r3, r2, r7
+; CHECK-NEXT: vmov r2, r7, d2
+; CHECK-NEXT: asrl r6, r3, r2
+; CHECK-NEXT: asrl r12, r1, r7
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
%b = load <4 x i32>, ptr %B, align 4
@@ -136,55 +127,42 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
+; CHECK-NEXT: vand q2, q2, q0
+; CHECK-NEXT: vmov r5, r0, d7
+; CHECK-NEXT: vmov r1, r7, d5
+; CHECK-NEXT: vmov r12, lr, d4
+; CHECK-NEXT: vldrw.u32 q2, [r2]
; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vand q4, q2, q0
-; CHECK-NEXT: vand q2, q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r4, r5, d9
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov lr, r12, d8
-; CHECK-NEXT: vmov.f32 s16, s6
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: vmov r6, r1, d5
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: adds.w r8, r0, r4
+; CHECK-NEXT: vand q0, q1, q0
+; CHECK-NEXT: adds.w r8, r0, r1
; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adcs r5, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: asrl r8, r5, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vmov r5, r7, d4
-; CHECK-NEXT: asrs r4, r2, #31
-; CHECK-NEXT: adds r2, r2, r6
-; CHECK-NEXT: adcs r1, r4
-; CHECK-NEXT: vmov r4, s2
-; CHECK-NEXT: asrl r2, r1, r4
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: adds.w r6, r1, lr
-; CHECK-NEXT: asr.w r4, r1, #31
-; CHECK-NEXT: adc.w r1, r4, r12
-; CHECK-NEXT: vmov r4, s16
-; CHECK-NEXT: asrl r6, r1, r4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: adds r0, r1, r5
-; CHECK-NEXT: asr.w r4, r1, #31
-; CHECK-NEXT: adc.w r1, r4, r7
-; CHECK-NEXT: vmov r7, s4
-; CHECK-NEXT: asrl r0, r1, r7
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r6
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r8
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: adcs r7, r2
+; CHECK-NEXT: asrs r4, r5, #31
+; CHECK-NEXT: adds.w r2, r5, r12
+; CHECK-NEXT: vmov r6, r1, d6
+; CHECK-NEXT: adc.w r5, r4, lr
+; CHECK-NEXT: vmov r4, r12, d5
+; CHECK-NEXT: asrl r2, r5, r4
+; CHECK-NEXT: asrl r8, r7, r12
+; CHECK-NEXT: vmov r5, r4, d0
+; CHECK-NEXT: asrs r7, r1, #31
+; CHECK-NEXT: adds r0, r6, r5
+; CHECK-NEXT: asr.w r6, r6, #31
+; CHECK-NEXT: adc.w r5, r6, r4
+; CHECK-NEXT: vmov r6, r4, d4
+; CHECK-NEXT: asrl r0, r5, r6
+; CHECK-NEXT: vmov q1[2], q1[0], r0, r2
+; CHECK-NEXT: vmov r0, r2, d1
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r7, r2
+; CHECK-NEXT: asrl r0, r1, r4
+; CHECK-NEXT: vmov q1[3], q1[1], r0, r8
+; CHECK-NEXT: vstrw.32 q1, [r3]
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
@@ -268,36 +246,31 @@ entry:
define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r9, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: adds.w r12, r2, r2
-; CHECK-NEXT: asr.w r3, r2, #31
-; CHECK-NEXT: adc.w r3, r3, r2, asr #31
-; CHECK-NEXT: asrl r12, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adds r2, r3, r3
-; CHECK-NEXT: asr.w r0, r3, #31
-; CHECK-NEXT: adc.w r5, r0, r3, asr #31
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: asrl r2, r5, r3
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r5, r0, d0
+; CHECK-NEXT: adds r6, r3, r3
+; CHECK-NEXT: asr.w r12, r3, #31
+; CHECK-NEXT: adc.w r9, r12, r3, asr #31
+; CHECK-NEXT: adds r4, r2, r2
+; CHECK-NEXT: asr.w r12, r2, #31
+; CHECK-NEXT: adc.w r7, r12, r2, asr #31
+; CHECK-NEXT: asrl r6, r9, r3
+; CHECK-NEXT: asrl r4, r7, r2
+; CHECK-NEXT: adds r2, r5, r5
+; CHECK-NEXT: asr.w r7, r5, #31
+; CHECK-NEXT: adc.w r7, r7, r5, asr #31
+; CHECK-NEXT: asrl r2, r7, r5
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r4
; CHECK-NEXT: adds r4, r0, r0
-; CHECK-NEXT: asr.w r3, r0, #31
-; CHECK-NEXT: adc.w r3, r3, r0, asr #31
+; CHECK-NEXT: asr.w r2, r0, #31
+; CHECK-NEXT: adc.w r3, r2, r0, asr #31
; CHECK-NEXT: asrl r4, r3, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: adds r6, r0, r0
-; CHECK-NEXT: asr.w r3, r0, #31
-; CHECK-NEXT: adc.w r3, r3, r0, asr #31
-; CHECK-NEXT: asrl r6, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r12
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
%sa = sext <4 x i32> %a to <4 x i64>
@@ -360,34 +333,30 @@ entry:
define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: ldr.w lr, [sp, #20]
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov.f32 s14, s5
-; CHECK-NEXT: vmov r5, s4
-; CHECK-NEXT: vmov.f32 s4, s6
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov r1, s14
-; CHECK-NEXT: smull r12, r3, r1, r0
-; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: ldr.w r12, [sp, #24]
+; CHECK-NEXT: vmov r3, lr, d0
+; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov.f32 s4, s6
+; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmullb.s32 q2, q1, q0
-; CHECK-NEXT: asrl r12, r3, r2
-; CHECK-NEXT: vmov r6, r1, d4
-; CHECK-NEXT: vmov r4, r7, d5
+; CHECK-NEXT: vmov r4, r5, d5
+; CHECK-NEXT: asrl r4, r5, r2
+; CHECK-NEXT: smull r8, r3, r0, r3
+; CHECK-NEXT: vmov r0, r7, d4
+; CHECK-NEXT: asrl r0, r7, r2
+; CHECK-NEXT: smull r6, r1, r1, lr
+; CHECK-NEXT: asrl r8, r3, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r8, r0
; CHECK-NEXT: asrl r6, r1, r2
-; CHECK-NEXT: asrl r4, r7, r2
-; CHECK-NEXT: smull r0, r5, r5, r0
-; CHECK-NEXT: asrl r0, r5, r2
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r6
-; CHECK-NEXT: vmov q0[3], q0[1], r12, r4
-; CHECK-NEXT: vstrw.32 q0, [lr]
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: vmov q0[3], q0[1], r6, r4
+; CHECK-NEXT: vstrw.32 q0, [r12]
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
%b = load <4 x i32>, ptr %B, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index acbe48f9e5927..e3a6ec81aae80 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -64,27 +64,19 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_trunc_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov.f32 s8, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: add.w r12, r1, r0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: add r0, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vmov lr, r12, d3
+; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r1, r0, d2
+; CHECK-NEXT: vmov r4, r5, d0
+; CHECK-NEXT: add r2, lr
+; CHECK-NEXT: add r3, r12
+; CHECK-NEXT: add r1, r4
+; CHECK-NEXT: add r0, r5
+; CHECK-NEXT: vmov q0[2], q0[0], r1, r2
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r3
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
@@ -180,44 +172,40 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_add_ashr_trunc_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: vmov.f32 s12, s6
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov.f32 s14, s7
-; CHECK-NEXT: vand q1, q1, q2
+; CHECK-NEXT: .save {r4, r5, r6, r7, r9, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, lr}
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.i64 q3, #0xffffffff
+; CHECK-NEXT: vmov.f32 s10, s5
+; CHECK-NEXT: vmov r12, r0, d0
+; CHECK-NEXT: vand q2, q2, q3
+; CHECK-NEXT: vmov r1, lr, d5
+; CHECK-NEXT: vmov.f32 s4, s6
+; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vand q1, q1, q3
+; CHECK-NEXT: vmov r3, r5, d3
+; CHECK-NEXT: asrs r4, r0, #31
+; CHECK-NEXT: adds r6, r0, r1
+; CHECK-NEXT: adc.w r9, r4, lr
+; CHECK-NEXT: vmov r4, r1, d1
+; CHECK-NEXT: asrl r6, r9, #1
+; CHECK-NEXT: adds r0, r1, r3
; CHECK-NEXT: vmov r3, r7, d2
-; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov r0, r1, d6
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov lr, r12, d7
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: asrs r5, r2, #31
-; CHECK-NEXT: adds r2, r2, r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: adcs r1, r5
-; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: asr.w r1, r1, #31
+; CHECK-NEXT: adcs r5, r1
+; CHECK-NEXT: asrs r1, r4, #31
+; CHECK-NEXT: asrl r0, r5, #1
+; CHECK-NEXT: adds r4, r4, r3
+; CHECK-NEXT: adcs r1, r7
+; CHECK-NEXT: vmov r3, r7, d4
+; CHECK-NEXT: asrl r4, r1, #1
+; CHECK-NEXT: asr.w r1, r12, #31
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adcs r1, r7
; CHECK-NEXT: asrl r2, r1, #1
-; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: adds.w r0, r0, lr
-; CHECK-NEXT: adc.w r1, r1, r12
-; CHECK-NEXT: asrs r4, r5, #31
-; CHECK-NEXT: adds r6, r5, r3
-; CHECK-NEXT: vmov r3, r5, d3
-; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: asrl r0, r1, #1
-; CHECK-NEXT: adcs r7, r4
-; CHECK-NEXT: asrl r6, r7, #1
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: adds r6, r1, r3
-; CHECK-NEXT: asr.w r2, r1, #31
-; CHECK-NEXT: adc.w r1, r2, r5
-; CHECK-NEXT: asrl r6, r1, #1
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r4
; CHECK-NEXT: vmov q0[3], q0[1], r6, r0
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
@@ -300,95 +288,87 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: ext_ops_trunc_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: vmov.f32 s8, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov r10, s8
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: vmov r3, lr, d1
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov r1, r2, d3
; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s10, s7
; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: asr.w r0, r10, #31
-; CHECK-NEXT: adds.w r6, r10, r2
-; CHECK-NEXT: eor.w r7, r10, r2
-; CHECK-NEXT: adc r3, r0, #0
-; CHECK-NEXT: asrl r6, r3, r2
+; CHECK-NEXT: adds.w r6, lr, r2
+; CHECK-NEXT: asr.w r0, lr, #31
+; CHECK-NEXT: adc r5, r0, #0
+; CHECK-NEXT: eor.w r7, r3, r1
+; CHECK-NEXT: asrl r6, r5, r2
; CHECK-NEXT: subs r0, r6, r2
-; CHECK-NEXT: vmov r6, s2
-; CHECK-NEXT: sbc lr, r3, #0
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: vmov.f32 s2, s1
-; CHECK-NEXT: umull r0, r8, r0, r2
-; CHECK-NEXT: asrs r5, r6, #31
-; CHECK-NEXT: adds r4, r6, r3
+; CHECK-NEXT: sbc r8, r5, #0
+; CHECK-NEXT: asrs r5, r3, #31
+; CHECK-NEXT: adds r4, r3, r1
+; CHECK-NEXT: umull r0, r9, r0, r2
; CHECK-NEXT: adc r5, r5, #0
-; CHECK-NEXT: eor.w r1, r6, r3
-; CHECK-NEXT: asrl r4, r5, r3
-; CHECK-NEXT: subs r4, r4, r3
+; CHECK-NEXT: asrl r4, r5, r1
+; CHECK-NEXT: subs r4, r4, r1
; CHECK-NEXT: sbc r5, r5, #0
-; CHECK-NEXT: orrs.w r7, r7, r10, asr #31
-; CHECK-NEXT: umull r4, r12, r4, r3
-; CHECK-NEXT: csetm r9, eq
-; CHECK-NEXT: orrs.w r1, r1, r6, asr #31
+; CHECK-NEXT: orrs.w r7, r7, r3, asr #31
+; CHECK-NEXT: umull r4, r6, r4, r1
; CHECK-NEXT: mov.w r7, #0
-; CHECK-NEXT: csetm r1, eq
-; CHECK-NEXT: bfi r7, r9, #0, #8
-; CHECK-NEXT: mla r5, r5, r3, r12
-; CHECK-NEXT: bfi r7, r1, #8, #8
-; CHECK-NEXT: rsbs r1, r6, #0
-; CHECK-NEXT: vmsr p0, r7
-; CHECK-NEXT: mla r7, lr, r2, r8
-; CHECK-NEXT: lsll r4, r5, r1
-; CHECK-NEXT: rsb.w r1, r10, #0
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: mla r5, r5, r1, r6
+; CHECK-NEXT: csetm r6, eq
+; CHECK-NEXT: bfi r7, r6, #0, #8
+; CHECK-NEXT: eor.w r6, lr, r2
; CHECK-NEXT: lsll r4, r5, r3
-; CHECK-NEXT: lsll r0, r7, r1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: orrs.w r6, r6, lr, asr #31
+; CHECK-NEXT: rsb.w r3, lr, #0
+; CHECK-NEXT: csetm r6, eq
+; CHECK-NEXT: lsll r4, r5, r1
+; CHECK-NEXT: bfi r7, r6, #8, #8
+; CHECK-NEXT: vmsr p0, r7
+; CHECK-NEXT: mla r7, r8, r2, r9
+; CHECK-NEXT: lsll r0, r7, r3
; CHECK-NEXT: lsll r0, r7, r2
-; CHECK-NEXT: vmov q3[2], q3[0], r0, r4
-; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov r1, r2, d2
+; CHECK-NEXT: vmov q3[2], q3[0], r4, r0
+; CHECK-NEXT: vmov r4, r3, d0
; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: adds r2, r3, r1
-; CHECK-NEXT: asr.w r0, r3, #31
+; CHECK-NEXT: adds r6, r4, r1
+; CHECK-NEXT: asr.w r0, r4, #31
; CHECK-NEXT: adc r5, r0, #0
-; CHECK-NEXT: asrl r2, r5, r1
-; CHECK-NEXT: subs r0, r2, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: sbc r8, r5, #0
-; CHECK-NEXT: umull r4, lr, r0, r1
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: asrs r5, r2, #31
-; CHECK-NEXT: adds r6, r2, r0
+; CHECK-NEXT: asrl r6, r5, r1
+; CHECK-NEXT: subs r0, r6, r1
+; CHECK-NEXT: sbc lr, r5, #0
+; CHECK-NEXT: asrs r5, r3, #31
+; CHECK-NEXT: adds r6, r3, r2
; CHECK-NEXT: adc r7, r5, #0
-; CHECK-NEXT: mla r5, r8, r1, lr
-; CHECK-NEXT: asrl r6, r7, r0
-; CHECK-NEXT: subs.w r8, r6, r0
-; CHECK-NEXT: eor.w r6, r2, r0
+; CHECK-NEXT: umull r0, r5, r0, r1
+; CHECK-NEXT: asrl r6, r7, r2
+; CHECK-NEXT: subs r6, r6, r2
+; CHECK-NEXT: mla r5, lr, r1, r5
; CHECK-NEXT: sbc lr, r7, #0
-; CHECK-NEXT: eor.w r7, r3, r1
-; CHECK-NEXT: orrs.w r6, r6, r2, asr #31
-; CHECK-NEXT: orr.w r7, r7, r3, asr #31
-; CHECK-NEXT: csetm r6, eq
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: bfi r12, r6, #0, #8
-; CHECK-NEXT: csetm r6, eq
-; CHECK-NEXT: bfi r12, r6, #8, #8
-; CHECK-NEXT: umull r6, r7, r8, r0
-; CHECK-NEXT: rsb.w r8, r3, #0
-; CHECK-NEXT: lsll r4, r5, r8
+; CHECK-NEXT: eor.w r7, r4, r1
+; CHECK-NEXT: orrs.w r7, r7, r4, asr #31
+; CHECK-NEXT: umull r6, r8, r6, r2
+; CHECK-NEXT: csetm r7, eq
+; CHECK-NEXT: rsbs r4, r4, #0
+; CHECK-NEXT: bfi r12, r7, #0, #8
+; CHECK-NEXT: lsll r0, r5, r4
+; CHECK-NEXT: eor.w r7, r3, r2
+; CHECK-NEXT: lsll r0, r5, r1
+; CHECK-NEXT: orrs.w r7, r7, r3, asr #31
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: csetm r7, eq
+; CHECK-NEXT: bfi r12, r7, #8, #8
+; CHECK-NEXT: mla r7, lr, r2, r8
; CHECK-NEXT: vmsr p0, r12
-; CHECK-NEXT: mla r3, lr, r0, r7
-; CHECK-NEXT: lsll r4, r5, r1
-; CHECK-NEXT: rsbs r1, r2, #0
-; CHECK-NEXT: lsll r6, r3, r1
-; CHECK-NEXT: lsll r6, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT: lsll r6, r7, r3
+; CHECK-NEXT: lsll r6, r7, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r6
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vmov.f32 s1, s2
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vmov.f32 s3, s10
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
%sa = sext <4 x i32> %a to <4 x i64>
%sb = zext <4 x i32> %b to <4 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 29b56639bd769..e69cb2b699082 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -225,22 +225,20 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: vmov.f32 s16, s10
; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: vmov.f32 s20, s14
-; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: vmov.f32 s18, s11
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: vmov.f32 s20, s14
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vmov.f32 s22, s15
; CHECK-NEXT: vmullb.s32 q6, q5, q4
-; CHECK-NEXT: vmov.f32 s14, s13
; CHECK-NEXT: vmov r4, r7, d12
; CHECK-NEXT: asrl r4, r7, #31
-; CHECK-NEXT: vmov.f32 s10, s9
; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
; CHECK-NEXT: sbcs.w r5, r2, r7
; CHECK-NEXT: csetm r5, lt
; CHECK-NEXT: bfi r8, r5, #0, #8
; CHECK-NEXT: vmov r10, r5, d13
; CHECK-NEXT: asrl r10, r5, #31
-; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
; CHECK-NEXT: vmov q4[2], q4[0], r4, r10
; CHECK-NEXT: sbcs.w r3, r2, r5
@@ -259,30 +257,28 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no
; CHECK-NEXT: vmov r3, r5, d9
; CHECK-NEXT: subs.w r3, r3, r8
; CHECK-NEXT: sbcs r3, r5, #0
-; CHECK-NEXT: mov.w r5, #0
; CHECK-NEXT: csetm r3, lt
; CHECK-NEXT: bfi r4, r3, #8, #8
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r3, r5, d4
; CHECK-NEXT: vmsr p0, r4
-; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: vmov r4, r6, d6
; CHECK-NEXT: vpsel q4, q4, q1
; CHECK-NEXT: smull r4, r7, r4, r3
; CHECK-NEXT: asrl r4, r7, #31
; CHECK-NEXT: rsbs.w r3, r4, #-2147483648
; CHECK-NEXT: sbcs.w r3, r2, r7
; CHECK-NEXT: csetm r3, lt
-; CHECK-NEXT: bfi r5, r3, #0, #8
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: smull r6, r3, r6, r3
+; CHECK-NEXT: bfi r1, r3, #0, #8
+; CHECK-NEXT: smull r6, r3, r6, r5
; CHECK-NEXT: asrl r6, r3, #31
-; CHECK-NEXT: rsbs.w r1, r6, #-2147483648
+; CHECK-NEXT: rsbs.w r5, r6, #-2147483648
; CHECK-NEXT: vmov q2[2], q2[0], r4, r6
-; CHECK-NEXT: sbcs.w r1, r2, r3
+; CHECK-NEXT: sbcs.w r5, r2, r3
; CHECK-NEXT: vmov q2[3], q2[1], r7, r3
-; CHECK-NEXT: csetm r1, lt
-; CHECK-NEXT: bfi r5, r1, #8, #8
-; CHECK-NEXT: vmsr p0, r5
+; CHECK-NEXT: csetm r5, lt
+; CHECK-NEXT: bfi r1, r5, #8, #8
; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpsel q2, q2, q0
; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: subs.w r1, r1, r8
@@ -464,7 +460,6 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
; CHECK-NEXT: vmov.f32 s28, s22
; CHECK-NEXT: vmov.f32 s30, s23
; CHECK-NEXT: vmullb.s32 q0, q7, q6
-; CHECK-NEXT: vmov.f32 s18, s21
; CHECK-NEXT: vmov r10, r5, d0
; CHECK-NEXT: asrl r10, r5, #31
; CHECK-NEXT: rsbs.w r7, r10, #-2147483648
@@ -478,7 +473,6 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
; CHECK-NEXT: sbcs.w r3, r12, r7
; CHECK-NEXT: vmov q0[3], q0[1], r5, r7
; CHECK-NEXT: csetm r3, lt
-; CHECK-NEXT: vmov r7, s18
; CHECK-NEXT: bfi r4, r3, #8, #8
; CHECK-NEXT: vmsr p0, r4
; CHECK-NEXT: vpsel q0, q0, q2
@@ -491,25 +485,23 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n
; CHECK-NEXT: vmov r3, r5, d1
; CHECK-NEXT: subs.w r3, r3, r8
; CHECK-NEXT: sbcs r3, r5, #0
+; CHECK-NEXT: vmov r5, r7, d10
; CHECK-NEXT: csetm r3, lt
; CHECK-NEXT: bfi r4, r3, #8, #8
-; CHECK-NEXT: vmov r3, s16
+; CHECK-NEXT: vmov r3, r10, d8
; CHECK-NEXT: vmsr p0, r4
-; CHECK-NEXT: vmov r4, s20
-; CHECK-NEXT: vpsel q6, q0, q3
-; CHECK-NEXT: vmov.f32 s2, s17
-; CHECK-NEXT: smull r10, r5, r4, r3
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: asrl r10, r5, #31
-; CHECK-NEXT: rsbs.w r3, r10, #-2147483648
+; CHECK-NEXT: vpsel q6, q0, q3
+; CHECK-NEXT: smull r6, r5, r5, r3
+; CHECK-NEXT: asrl r6, r5, #31
+; CHECK-NEXT: rsbs.w r3, r6, #-2147483648
; CHECK-NEXT: sbcs.w r3, r12, r5
; CHECK-NEXT: csetm r3, lt
; CHECK-NEXT: bfi r4, r3, #0, #8
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: smull r6, r3, r7, r3
-; CHECK-NEXT: asrl r6, r3, #31
-; CHECK-NEXT: rsbs.w r7, r6, #-2147483648
-; CHECK-NEXT: vmov q0[2], q0[0], r10, r6
+; CHECK-NEXT: smull r10, r3, r7, r10
+; CHECK-NEXT: asrl r10, r3, #31
+; CHECK-NEXT: rsbs.w r7, r10, #-2147483648
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r10
; CHECK-NEXT: sbcs.w r7, r12, r3
; CHECK-NEXT: vmov q0[3], q0[1], r5, r3
; CHECK-NEXT: csetm r7, lt
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 05f438acc3a7e..601390860b830 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -51,41 +51,36 @@ entry:
define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) {
; CHECK-LABEL: foo_v4i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vpt.s32 lt, q0, zr
; CHECK-NEXT: vldrwt.u32 q5, [r0]
-; CHECK-NEXT: vmov.f32 s2, s23
-; CHECK-NEXT: vmov.f32 s16, s22
-; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r4, r0, d10
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: asrs r2, r4, #31
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov.f32 s2, s21
+; CHECK-NEXT: vmov r4, r2, d11
; CHECK-NEXT: vmov d8, r0, r1
-; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r3, r2, #31
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: asrs r2, r4, #31
; CHECK-NEXT: vmov d11, r0, r1
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov d10, r0, r1
-; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: vmov q0, q5
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov q1, q5
; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 042a6ea18412a..a7e927bce16d7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -367,58 +367,48 @@ for.cond.cleanup: ; preds = %vector.body
define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: mov.w lr, #256
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB17_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vldrw.u32 q3, [r1], #16
-; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r7, s4
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s14, s15
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: asr.w r12, r3, #31
-; CHECK-NEXT: subs.w r8, r3, r4
-; CHECK-NEXT: sbc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov r4, s6
-; CHECK-NEXT: vmov.f32 s6, s13
-; CHECK-NEXT: vmov r6, s6
-; CHECK-NEXT: asrs r5, r4, #31
-; CHECK-NEXT: subs.w r9, r4, r6
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: sbc.w r5, r5, r6, asr #31
-; CHECK-NEXT: vmov r6, s12
-; CHECK-NEXT: asrs r5, r5, #31
-; CHECK-NEXT: subs r3, r7, r6
-; CHECK-NEXT: asr.w r7, r7, #31
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r8
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
-; CHECK-NEXT: asrs r6, r6, #31
-; CHECK-NEXT: subs r7, r4, r3
-; CHECK-NEXT: vmov q1[3], q1[1], r9, r7
-; CHECK-NEXT: mov.w r7, #0
-; CHECK-NEXT: bfi r7, r6, #0, #4
-; CHECK-NEXT: asr.w r6, r12, #31
-; CHECK-NEXT: bfi r7, r5, #4, #4
-; CHECK-NEXT: bfi r7, r6, #8, #4
-; CHECK-NEXT: asr.w r6, r4, #31
-; CHECK-NEXT: sbc.w r3, r6, r3, asr #31
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
+; CHECK-NEXT: vmov r3, r12, d3
+; CHECK-NEXT: vmov r5, r10, d5
+; CHECK-NEXT: subs.w r9, r3, r5
+; CHECK-NEXT: asr.w r4, r3, #31
+; CHECK-NEXT: sbc.w r8, r4, r5, asr #31
+; CHECK-NEXT: vmov r5, r7, d2
+; CHECK-NEXT: vmov r3, r6, d4
+; CHECK-NEXT: asrs r4, r7, #31
+; CHECK-NEXT: subs r7, r7, r6
+; CHECK-NEXT: sbc.w r4, r4, r6, asr #31
+; CHECK-NEXT: subs r6, r5, r3
+; CHECK-NEXT: asr.w r5, r5, #31
+; CHECK-NEXT: sbc.w r3, r5, r3, asr #31
+; CHECK-NEXT: vmov q1[2], q1[0], r6, r9
+; CHECK-NEXT: subs.w r5, r12, r10
+; CHECK-NEXT: vmov q1[3], q1[1], r7, r5
+; CHECK-NEXT: asr.w r3, r3, #31
+; CHECK-NEXT: mov.w r5, #0
+; CHECK-NEXT: bfi r5, r3, #0, #4
+; CHECK-NEXT: asr.w r3, r4, #31
+; CHECK-NEXT: bfi r5, r3, #4, #4
+; CHECK-NEXT: asr.w r3, r8, #31
+; CHECK-NEXT: bfi r5, r3, #8, #4
+; CHECK-NEXT: asr.w r3, r12, #31
+; CHECK-NEXT: sbc.w r3, r3, r10, asr #31
; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: bfi r7, r3, #12, #4
-; CHECK-NEXT: vmsr p0, r7
+; CHECK-NEXT: bfi r5, r3, #12, #4
+; CHECK-NEXT: vmsr p0, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vsubt.i32 q1, q0, q1
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: le lr, .LBB17_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
br label %vector.body
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index dca4fb3d6cfa3..38ab878e2e321 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -7,14 +7,11 @@ define void @vld2_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld2_v2i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: strd r2, r0, [r1]
+; CHECK-NEXT: vmov r12, r2, d1
+; CHECK-NEXT: vmov r3, r0, d0
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: add r0, r3
+; CHECK-NEXT: strd r0, r2, [r1]
; CHECK-NEXT: bx lr
entry:
%l1 = load <4 x i32>, ptr %src, align 4
@@ -127,14 +124,11 @@ define void @vld2_v2i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld2_v2i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r0, r2, d1
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strh r0, [r1, #2]
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: add.w r0, r3, r12
; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -237,14 +231,11 @@ define void @vld2_v2i8(ptr %src, ptr %dst) {
; CHECK-LABEL: vld2_v2i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r0, r2, d1
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: strb r0, [r1, #1]
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: add.w r0, r3, r12
; CHECK-NEXT: strb r0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -342,43 +333,32 @@ define void @vld2_v4i64(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8}
-; CHECK-NEXT: vpush {d8}
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s5, s3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vmov.f32 s16, s14
-; CHECK-NEXT: vmov.f32 s17, s15
-; CHECK-NEXT: vmov lr, r12, d5
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vmov r5, r6, d6
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov r0, r7, d8
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, r4, d5
-; CHECK-NEXT: adds r0, r0, r5
-; CHECK-NEXT: adc.w r8, r6, r7
-; CHECK-NEXT: vmov r6, r5, d1
-; CHECK-NEXT: vmov r2, r7, d0
-; CHECK-NEXT: adds r3, r3, r6
-; CHECK-NEXT: adc.w r6, r5, r4
-; CHECK-NEXT: vmov r5, r4, d2
-; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
-; CHECK-NEXT: vmov q1[3], q1[1], r8, r6
-; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vmov lr, r12, d1
+; CHECK-NEXT: vmov r3, r2, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r4, r5, d3
+; CHECK-NEXT: vmov r0, r6, d2
+; CHECK-NEXT: adds.w lr, lr, r3
+; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: adds r0, r0, r4
+; CHECK-NEXT: vmov r7, r4, d0
+; CHECK-NEXT: adc.w r8, r6, r5
+; CHECK-NEXT: vmov r5, r6, d4
; CHECK-NEXT: adds r2, r2, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: adc.w r0, r7, r4
+; CHECK-NEXT: adcs r3, r6
+; CHECK-NEXT: vmov r5, r6, d1
+; CHECK-NEXT: vmov q1[2], q1[0], r0, r2
+; CHECK-NEXT: vmov q1[3], q1[1], r8, r3
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: adds r5, r5, r7
+; CHECK-NEXT: vmov q0[2], q0[0], r5, lr
+; CHECK-NEXT: adc.w r0, r4, r6
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%l1 = load <8 x i64>, ptr %src, align 8
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index 4dd9173e2d418..ce28c11d47d0c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -7,20 +7,18 @@
define void @vld3_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: ldrd r0, r2, [r0, #16]
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov r12, lr, d0
-; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: ldrd r0, r4, [r0, #16]
+; CHECK-NEXT: vmov r12, r3, d1
+; CHECK-NEXT: vmov r2, lr, d0
; CHECK-NEXT: add r0, r3
-; CHECK-NEXT: add.w r3, r12, lr
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: add r0, r4
+; CHECK-NEXT: add r2, lr
+; CHECK-NEXT: add r2, r12
; CHECK-NEXT: strd r2, r0, [r1]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%l1 = load <6 x i32>, ptr %src, align 4
%s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
@@ -353,32 +351,26 @@ entry:
define void @vld3_v2i16(ptr %src, ptr %dst) {
; CHECK-LABEL: vld3_v2i16:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: ldr r2, [r0, #8]
-; CHECK-NEXT: mov r3, sp
-; CHECK-NEXT: str r2, [sp]
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov.f32 s8, s1
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vldrh.u32 q1, [r3]
-; CHECK-NEXT: vmov.f32 s6, s4
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: ldr r0, [r0, #8]
+; CHECK-NEXT: mov r6, sp
+; CHECK-NEXT: str r0, [sp]
+; CHECK-NEXT: vmov r3, r2, d0
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vldrh.u32 q0, [r6]
+; CHECK-NEXT: vmov r0, r6, d0
+; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: add r2, r4
+; CHECK-NEXT: strh r2, [r1]
+; CHECK-NEXT: add r0, r5
+; CHECK-NEXT: add r0, r6
; CHECK-NEXT: strh r0, [r1, #2]
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: strh r0, [r1]
; CHECK-NEXT: add sp, #8
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%l1 = load <6 x i16>, ptr %src, align 4
%s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
@@ -935,65 +927,31 @@ entry:
; i64
define void @vld3_v2i64(ptr %src, ptr %dst) {
-; CHECK-LV-LABEL: vld3_v2i64:
-; CHECK-LV: @ %bb.0: @ %entry
-; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-LV-NEXT: vldrw.u32 q0, [r0]
-; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-LV-NEXT: vmov.f32 s12, s2
-; CHECK-LV-NEXT: vmov.f32 s13, s3
-; CHECK-LV-NEXT: vmov.f32 s2, s4
-; CHECK-LV-NEXT: vmov.f32 s3, s5
-; CHECK-LV-NEXT: vmov r0, r3, d5
-; CHECK-LV-NEXT: vmov r2, r4, d3
-; CHECK-LV-NEXT: vmov r6, r7, d0
-; CHECK-LV-NEXT: vmov r5, r8, d6
-; CHECK-LV-NEXT: vmov lr, r12, d1
-; CHECK-LV-NEXT: adds.w r0, r0, lr
-; CHECK-LV-NEXT: adc.w r3, r3, r12
-; CHECK-LV-NEXT: adds r0, r0, r2
-; CHECK-LV-NEXT: adc.w r2, r3, r4
-; CHECK-LV-NEXT: vmov r3, r4, d4
-; CHECK-LV-NEXT: adds r6, r6, r5
-; CHECK-LV-NEXT: adc.w r7, r7, r8
-; CHECK-LV-NEXT: adds r3, r3, r6
-; CHECK-LV-NEXT: adcs r7, r4
-; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0
-; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2
-; CHECK-LV-NEXT: vstrw.32 q0, [r1]
-; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
-;
-; CHECK-LIS-LABEL: vld3_v2i64:
-; CHECK-LIS: @ %bb.0: @ %entry
-; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-LIS-NEXT: vldrw.u32 q0, [r0]
-; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-LIS-NEXT: vmov.f32 s8, s2
-; CHECK-LIS-NEXT: vmov.f32 s9, s3
-; CHECK-LIS-NEXT: vmov.f32 s2, s4
-; CHECK-LIS-NEXT: vmov.f32 s3, s5
-; CHECK-LIS-NEXT: vmov r0, r3, d7
-; CHECK-LIS-NEXT: vmov r2, r4, d3
-; CHECK-LIS-NEXT: vmov r6, r7, d0
-; CHECK-LIS-NEXT: vmov r5, r8, d4
-; CHECK-LIS-NEXT: vmov lr, r12, d1
-; CHECK-LIS-NEXT: adds.w r0, r0, lr
-; CHECK-LIS-NEXT: adc.w r3, r3, r12
-; CHECK-LIS-NEXT: adds r0, r0, r2
-; CHECK-LIS-NEXT: adc.w r2, r3, r4
-; CHECK-LIS-NEXT: vmov r3, r4, d6
-; CHECK-LIS-NEXT: adds r6, r6, r5
-; CHECK-LIS-NEXT: adc.w r7, r7, r8
-; CHECK-LIS-NEXT: adds r3, r3, r6
-; CHECK-LIS-NEXT: adcs r7, r4
-; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, r0
-; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2
-; CHECK-LIS-NEXT: vstrw.32 q0, [r1]
-; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-LABEL: vld3_v2i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov lr, r12, d0
+; CHECK-NEXT: vmov r3, r2, d3
+; CHECK-NEXT: vmov r4, r7, d1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r0, r8, d1
+; CHECK-NEXT: vmov r5, r6, d0
+; CHECK-NEXT: adds.w r3, r3, lr
+; CHECK-NEXT: adc.w r2, r2, r12
+; CHECK-NEXT: adds r3, r3, r4
+; CHECK-NEXT: adcs r2, r7
+; CHECK-NEXT: vmov r7, r4, d2
+; CHECK-NEXT: adds r0, r0, r5
+; CHECK-NEXT: adc.w r6, r6, r8
+; CHECK-NEXT: adds r0, r0, r7
+; CHECK-NEXT: adc.w r7, r6, r4
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r3
+; CHECK-NEXT: vmov q0[3], q0[1], r7, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%l1 = load <6 x i64>, ptr %src, align 4
@@ -1007,123 +965,54 @@ entry:
}
define void @vld3_v4i64(ptr %src, ptr %dst) {
-; CHECK-LV-LABEL: vld3_v4i64:
-; CHECK-LV: @ %bb.0: @ %entry
-; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12}
-; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12}
-; CHECK-LV-NEXT: vldrw.u32 q0, [r0]
-; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #48]
-; CHECK-LV-NEXT: vmov.f32 s4, s2
-; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-LV-NEXT: vmov.f32 s5, s3
-; CHECK-LV-NEXT: vmov.f32 s2, s12
-; CHECK-LV-NEXT: vmov.f32 s3, s13
-; CHECK-LV-NEXT: vmov r2, r3, d5
-; CHECK-LV-NEXT: vmov r4, r8, d7
-; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-LV-NEXT: vmov.f32 s24, s18
-; CHECK-LV-NEXT: vmov.f32 s25, s19
-; CHECK-LV-NEXT: vmov.f32 s6, s22
-; CHECK-LV-NEXT: vmov.f32 s7, s23
-; CHECK-LV-NEXT: vmov lr, r12, d1
-; CHECK-LV-NEXT: vmov.f32 s2, s12
-; CHECK-LV-NEXT: vmov.f32 s3, s13
-; CHECK-LV-NEXT: vmov r6, r7, d12
-; CHECK-LV-NEXT: adds.w r0, r2, lr
-; CHECK-LV-NEXT: adc.w r2, r3, r12
-; CHECK-LV-NEXT: adds.w lr, r0, r4
-; CHECK-LV-NEXT: vmov r3, r5, d8
-; CHECK-LV-NEXT: adc.w r12, r2, r8
-; CHECK-LV-NEXT: vmov r2, r0, d10
-; CHECK-LV-NEXT: adds r3, r3, r6
-; CHECK-LV-NEXT: adcs r7, r5
-; CHECK-LV-NEXT: adds r2, r2, r3
-; CHECK-LV-NEXT: adc.w r8, r7, r0
-; CHECK-LV-NEXT: vmov r6, r5, d1
-; CHECK-LV-NEXT: vmov r3, r7, d3
-; CHECK-LV-NEXT: vmov r4, r0, d0
-; CHECK-LV-NEXT: adds r3, r3, r6
-; CHECK-LV-NEXT: adcs r7, r5
-; CHECK-LV-NEXT: vmov r6, r5, d7
-; CHECK-LV-NEXT: adds r3, r3, r6
-; CHECK-LV-NEXT: adcs r7, r5
-; CHECK-LV-NEXT: vmov r6, r5, d2
-; CHECK-LV-NEXT: vmov q1[2], q1[0], r2, r3
-; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7
-; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-LV-NEXT: adds r4, r4, r6
-; CHECK-LV-NEXT: adcs r0, r5
-; CHECK-LV-NEXT: vmov r5, r6, d4
-; CHECK-LV-NEXT: adds r4, r4, r5
-; CHECK-LV-NEXT: vmov q0[2], q0[0], r4, lr
-; CHECK-LV-NEXT: adcs r0, r6
-; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-LV-NEXT: vstrw.32 q0, [r1]
-; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12}
-; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
-;
-; CHECK-LIS-LABEL: vld3_v4i64:
-; CHECK-LIS: @ %bb.0: @ %entry
-; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12}
-; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12}
-; CHECK-LIS-NEXT: vldrw.u32 q0, [r0]
-; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #48]
-; CHECK-LIS-NEXT: vmov.f32 s4, s2
-; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-LIS-NEXT: vmov.f32 s5, s3
-; CHECK-LIS-NEXT: vmov.f32 s2, s12
-; CHECK-LIS-NEXT: vmov.f32 s3, s13
-; CHECK-LIS-NEXT: vmov r5, r4, d5
-; CHECK-LIS-NEXT: vmov r3, r8, d7
-; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-LIS-NEXT: vmov.f32 s24, s18
-; CHECK-LIS-NEXT: vmov.f32 s25, s19
-; CHECK-LIS-NEXT: vmov.f32 s6, s22
-; CHECK-LIS-NEXT: vmov.f32 s7, s23
-; CHECK-LIS-NEXT: vmov lr, r12, d1
-; CHECK-LIS-NEXT: vmov.f32 s2, s12
-; CHECK-LIS-NEXT: vmov.f32 s3, s13
-; CHECK-LIS-NEXT: vmov r7, r6, d12
-; CHECK-LIS-NEXT: adds.w r0, r5, lr
-; CHECK-LIS-NEXT: adc.w r5, r4, r12
-; CHECK-LIS-NEXT: adds.w lr, r0, r3
-; CHECK-LIS-NEXT: vmov r4, r2, d8
-; CHECK-LIS-NEXT: adc.w r12, r5, r8
-; CHECK-LIS-NEXT: vmov r5, r0, d10
-; CHECK-LIS-NEXT: adds r7, r7, r4
-; CHECK-LIS-NEXT: adcs r2, r6
-; CHECK-LIS-NEXT: adds r7, r7, r5
-; CHECK-LIS-NEXT: adc.w r8, r2, r0
-; CHECK-LIS-NEXT: vmov r6, r4, d1
-; CHECK-LIS-NEXT: vmov r2, r5, d3
-; CHECK-LIS-NEXT: vmov r3, r0, d0
-; CHECK-LIS-NEXT: adds r2, r2, r6
-; CHECK-LIS-NEXT: adc.w r6, r5, r4
-; CHECK-LIS-NEXT: vmov r5, r4, d7
-; CHECK-LIS-NEXT: adds r2, r2, r5
-; CHECK-LIS-NEXT: adcs r6, r4
-; CHECK-LIS-NEXT: vmov r5, r4, d2
-; CHECK-LIS-NEXT: vmov q1[2], q1[0], r7, r2
-; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r6
-; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-LIS-NEXT: adds r3, r3, r5
-; CHECK-LIS-NEXT: adcs r0, r4
-; CHECK-LIS-NEXT: vmov r4, r5, d4
-; CHECK-LIS-NEXT: adds r3, r3, r4
-; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, lr
-; CHECK-LIS-NEXT: adcs r0, r5
-; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-LIS-NEXT: vstrw.32 q0, [r1]
-; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12}
-; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-LABEL: vld3_v4i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov lr, r12, d2
+; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
+; CHECK-NEXT: vmov r3, r2, d1
+; CHECK-NEXT: vmov r4, r8, d3
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov r6, r7, d9
+; CHECK-NEXT: adds.w r0, r3, lr
+; CHECK-NEXT: vmov r3, r5, d8
+; CHECK-NEXT: adc.w r2, r2, r12
+; CHECK-NEXT: adds.w lr, r0, r4
+; CHECK-NEXT: adc.w r12, r2, r8
+; CHECK-NEXT: vmov r2, r0, d6
+; CHECK-NEXT: adds r3, r3, r6
+; CHECK-NEXT: adcs r7, r5
+; CHECK-NEXT: vmov r6, r5, d4
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r8, r7, r0
+; CHECK-NEXT: vmov r3, r7, d7
+; CHECK-NEXT: vmov r4, r0, d2
+; CHECK-NEXT: adds r3, r3, r6
+; CHECK-NEXT: adcs r7, r5
+; CHECK-NEXT: vmov r6, r5, d5
+; CHECK-NEXT: adds r3, r3, r6
+; CHECK-NEXT: adcs r7, r5
+; CHECK-NEXT: vmov r6, r5, d3
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r3
+; CHECK-NEXT: vmov q1[3], q1[1], r8, r7
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: adds r4, r4, r6
+; CHECK-NEXT: adcs r0, r5
+; CHECK-NEXT: vmov r5, r6, d0
+; CHECK-NEXT: adds r4, r4, r5
+; CHECK-NEXT: vmov q0[2], q0[0], r4, lr
+; CHECK-NEXT: adcs r0, r6
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%l1 = load <12 x i64>, ptr %src, align 4
%s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
index 1adc1269feab5..843140e0882d0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll
@@ -92,44 +92,33 @@ define ptr @vld4_v2i64(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vmov r2, r12, d1
+; CHECK-NEXT: vmov r3, lr, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: vmov r3, r6, d0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s5, s3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vmov lr, r12, d5
-; CHECK-NEXT: vldrw.u32 q2, [r0], #64
-; CHECK-NEXT: vmov r4, r8, d9
-; CHECK-NEXT: vmov.f32 s12, s10
-; CHECK-NEXT: vmov.f32 s13, s11
-; CHECK-NEXT: vmov r2, r7, d1
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s3, s17
-; CHECK-NEXT: vmov r3, r6, d1
-; CHECK-NEXT: adds.w r2, r2, lr
-; CHECK-NEXT: adc.w r7, r7, r12
+; CHECK-NEXT: adc.w r12, r12, lr
; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, r5, d2
-; CHECK-NEXT: adc.w r6, r6, r8
-; CHECK-NEXT: adds.w r12, r3, r2
-; CHECK-NEXT: vmov r3, r2, d0
-; CHECK-NEXT: adc.w lr, r6, r7
+; CHECK-NEXT: vmov r4, lr, d1
+; CHECK-NEXT: adcs r5, r6
+; CHECK-NEXT: adds.w r8, r3, r2
+; CHECK-NEXT: vmov r3, r6, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #64
+; CHECK-NEXT: adc.w r12, r12, r5
+; CHECK-NEXT: vmov r7, r5, d0
; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r6, r4, d6
+; CHECK-NEXT: vmov r4, r2, d1
+; CHECK-NEXT: adc.w r6, r6, lr
+; CHECK-NEXT: adds r7, r7, r4
; CHECK-NEXT: adcs r2, r5
-; CHECK-NEXT: vmov r5, r7, d4
-; CHECK-NEXT: adds r5, r5, r6
-; CHECK-NEXT: adcs r4, r7
-; CHECK-NEXT: adds r3, r3, r5
-; CHECK-NEXT: adcs r2, r4
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r12
-; CHECK-NEXT: vmov q0[3], q0[1], r2, lr
+; CHECK-NEXT: adds r3, r3, r7
+; CHECK-NEXT: adcs r2, r6
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r8
+; CHECK-NEXT: vmov q0[3], q0[1], r2, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%l1 = load <8 x i64>, ptr %src, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index ab41069bfa258..3bee5eb86695e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -6,28 +6,22 @@
define void @vld4_v2i32(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vmov r7, r3, d0
+; CHECK-NEXT: vmov r12, lr, d1
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.f32 s8, s3
-; CHECK-NEXT: vmov.f32 s12, s1
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: add.w r12, r2, r0
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmov r0, r4, d1
+; CHECK-NEXT: vmov r5, r6, d0
+; CHECK-NEXT: add r3, r7
+; CHECK-NEXT: add.w r2, r12, lr
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: add r0, r4
+; CHECK-NEXT: adds r3, r5, r6
; CHECK-NEXT: add r0, r3
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: strd r0, r12, [r1]
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: strd r0, r2, [r1]
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%l1 = load <8 x i32>, ptr %src, align 4
%s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
@@ -585,47 +579,36 @@ entry:
define void @vld4_v2i64(ptr %src, ptr %dst) {
; CHECK-LABEL: vld4_v2i64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s5, s3
-; CHECK-NEXT: vmov.f32 s2, s8
-; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vmov lr, r12, d5
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov r0, r8, d9
-; CHECK-NEXT: vmov.f32 s12, s10
-; CHECK-NEXT: vmov.f32 s13, s11
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s3, s17
-; CHECK-NEXT: vmov r5, r6, d1
-; CHECK-NEXT: adds.w r2, r2, lr
-; CHECK-NEXT: adc.w r3, r3, r12
-; CHECK-NEXT: vmov r4, r12, d2
-; CHECK-NEXT: adds r0, r0, r5
-; CHECK-NEXT: vmov r5, r7, d0
-; CHECK-NEXT: adc.w r6, r6, r8
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w lr, r6, r3
-; CHECK-NEXT: vmov r3, r6, d6
-; CHECK-NEXT: adds r5, r5, r4
-; CHECK-NEXT: vmov r4, r2, d4
-; CHECK-NEXT: adc.w r7, r7, r12
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT: vmov lr, r12, d1
+; CHECK-NEXT: vmov r3, r2, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r4, r7, d3
+; CHECK-NEXT: vmov r0, r6, d2
+; CHECK-NEXT: adds.w r3, r3, lr
+; CHECK-NEXT: adc.w r2, r2, r12
+; CHECK-NEXT: vmov r5, r12, d5
+; CHECK-NEXT: adds r0, r0, r4
+; CHECK-NEXT: adcs r7, r6
+; CHECK-NEXT: vmov r6, r4, d4
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adc.w lr, r7, r2
+; CHECK-NEXT: vmov r3, r7, d1
+; CHECK-NEXT: adds r6, r6, r5
+; CHECK-NEXT: adc.w r5, r4, r12
+; CHECK-NEXT: vmov r4, r2, d0
; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: adcs r2, r6
-; CHECK-NEXT: adds r3, r3, r5
; CHECK-NEXT: adcs r2, r7
+; CHECK-NEXT: adds r3, r3, r6
+; CHECK-NEXT: adcs r2, r5
; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r2, lr
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%l1 = load <8 x i64>, ptr %src, align 8
%s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
@@ -644,84 +627,60 @@ define void @vld4_v4i64(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #80]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s8, s2
-; CHECK-NEXT: vmov.f32 s9, s3
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s2, s20
-; CHECK-NEXT: vldrw.u32 q7, [r0, #112]
-; CHECK-NEXT: vmov.f32 s3, s21
-; CHECK-NEXT: vmov r3, r2, d11
-; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT: vmov lr, r12, d9
-; CHECK-NEXT: vmov.f32 s0, s26
-; CHECK-NEXT: vmov.f32 s1, s27
-; CHECK-NEXT: vmov.f32 s12, s6
-; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT: vmov r2, r12, d1
+; CHECK-NEXT: vmov r3, lr, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vmov r4, r7, d1
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: vmov r3, r6, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
+; CHECK-NEXT: adc.w r5, lr, r12
+; CHECK-NEXT: adds r3, r3, r4
+; CHECK-NEXT: vmov r4, r8, d1
+; CHECK-NEXT: adcs r7, r6
+; CHECK-NEXT: adds.w lr, r3, r2
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: adc.w r12, r7, r5
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: adds r2, r2, r4
; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s3, s17
-; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT: vmov.f32 s6, s28
-; CHECK-NEXT: vmov.f32 s7, s29
-; CHECK-NEXT: vmov.f32 s10, s20
-; CHECK-NEXT: vmov.f32 s11, s21
+; CHECK-NEXT: adc.w r3, r3, r8
+; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
+; CHECK-NEXT: adds r4, r4, r6
+; CHECK-NEXT: adcs r5, r7
+; CHECK-NEXT: adds.w r8, r4, r2
+; CHECK-NEXT: adc.w r9, r5, r3
+; CHECK-NEXT: vmov r4, r6, d1
+; CHECK-NEXT: vmov r5, r7, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: adds r4, r4, r5
+; CHECK-NEXT: adc.w r5, r7, r6
; CHECK-NEXT: vmov r6, r7, d1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: adc.w r4, r5, r2
-; CHECK-NEXT: vmov r5, r8, d0
-; CHECK-NEXT: vmov.f32 s0, s18
-; CHECK-NEXT: vmov.f32 s1, s19
-; CHECK-NEXT: adds.w r2, r6, lr
-; CHECK-NEXT: vmov r6, r0, d12
-; CHECK-NEXT: adc.w r7, r7, r12
-; CHECK-NEXT: adds.w lr, r2, r3
-; CHECK-NEXT: adc.w r12, r7, r4
-; CHECK-NEXT: vmov r7, r4, d0
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: adds r6, r6, r5
-; CHECK-NEXT: vmov r5, r3, d8
-; CHECK-NEXT: adc.w r0, r0, r8
-; CHECK-NEXT: adds r7, r7, r5
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: adds.w r9, r7, r6
-; CHECK-NEXT: adc.w r8, r3, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vmov r3, r6, d3
-; CHECK-NEXT: vmov r2, r0, d5
-; CHECK-NEXT: adds r3, r3, r5
-; CHECK-NEXT: adcs r6, r4
-; CHECK-NEXT: vmov r5, r4, d11
-; CHECK-NEXT: adds r2, r2, r5
-; CHECK-NEXT: adcs r0, r4
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r10, r0, r6
-; CHECK-NEXT: vmov r3, r4, d4
-; CHECK-NEXT: vmov r5, r6, d0
-; CHECK-NEXT: vmov r7, r0, d2
-; CHECK-NEXT: vmov q1[2], q1[0], r9, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r8, r10
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r2, r2, r6
+; CHECK-NEXT: adcs r3, r7
+; CHECK-NEXT: adds.w r10, r2, r4
+; CHECK-NEXT: adcs r3, r5
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vmov r6, r7, d0
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vmov q1[2], q1[0], r8, r10
+; CHECK-NEXT: vmov q1[3], q1[1], r9, r3
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: adds r3, r3, r5
-; CHECK-NEXT: adcs r4, r6
-; CHECK-NEXT: vmov r5, r6, d6
-; CHECK-NEXT: adds r5, r5, r7
-; CHECK-NEXT: adcs r0, r6
-; CHECK-NEXT: adds r3, r3, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r3, lr
-; CHECK-NEXT: adcs r0, r4
+; CHECK-NEXT: adds r4, r4, r6
+; CHECK-NEXT: vmov r0, r6, d1
+; CHECK-NEXT: adcs r5, r7
+; CHECK-NEXT: vmov r7, r2, d0
+; CHECK-NEXT: adds r0, r0, r7
+; CHECK-NEXT: adcs r2, r6
+; CHECK-NEXT: adds r0, r0, r4
+; CHECK-NEXT: vmov q0[2], q0[0], r0, lr
+; CHECK-NEXT: adc.w r0, r5, r2
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
%l1 = load <16 x i64>, ptr %src, align 8
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index cebc0d9c0e172..bc023cd28a1d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -132,9 +132,8 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: umull lr, r12, r1, r0
; CHECK-NEXT: umull r2, r5, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
@@ -162,10 +161,9 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vrev64.32 q1, q0
+; CHECK-NEXT: vmov r1, s3
; CHECK-NEXT: asrs r4, r0, #31
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: umull lr, r12, r0, r1
; CHECK-NEXT: umull r2, r5, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
@@ -240,36 +238,32 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_0213_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r2, r5, r3, r0
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: vmov r1, r2, d1
+; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: mla r4, r1, r2, r12
+; CHECK-NEXT: umull r5, r7, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r5, lr
+; CHECK-NEXT: asrs r5, r0, #31
+; CHECK-NEXT: mla r6, r1, r5, r12
; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r5, r3, r2, r5
+; CHECK-NEXT: mla r7, r3, r5, r7
; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: mla r3, r3, r0, r5
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r3, r5, r1, r0
-; CHECK-NEXT: mla r5, r1, r2, r5
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r12, r1, r0, r5
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: umull r4, r1, r5, r0
-; CHECK-NEXT: mla r1, r5, r2, r1
-; CHECK-NEXT: asrs r2, r5, #31
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT: mla r0, r2, r0, r1
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: mla r1, r1, r0, r6
+; CHECK-NEXT: mla r3, r3, r0, r7
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: umull r1, r3, r2, r0
+; CHECK-NEXT: umull r7, r6, r4, r0
+; CHECK-NEXT: vmov q1[2], q1[0], r7, r1
+; CHECK-NEXT: mla r1, r2, r5, r3
+; CHECK-NEXT: asrs r2, r2, #31
+; CHECK-NEXT: asrs r3, r4, #31
+; CHECK-NEXT: mla r1, r2, r0, r1
+; CHECK-NEXT: mla r2, r4, r5, r6
+; CHECK-NEXT: mla r0, r3, r0, r2
+; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -283,36 +277,32 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_0213:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: asrs r4, r0, #31
-; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r2, r5, r0, r3
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NEXT: vmov r1, r2, d1
+; CHECK-NEXT: asrs r6, r0, #31
+; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r2, r0, r2, r5
-; CHECK-NEXT: mla r2, r4, r3, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r2, r3, r0, r1
+; CHECK-NEXT: umull r5, r7, r0, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r5, lr
; CHECK-NEXT: asrs r5, r1, #31
-; CHECK-NEXT: mla r3, r0, r5, r3
-; CHECK-NEXT: mla r12, r4, r1, r3
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: umull r5, r1, r0, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r0, r0, r2, r1
-; CHECK-NEXT: mla r0, r4, r3, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: mla r5, r0, r5, r12
+; CHECK-NEXT: mla r1, r6, r1, r5
+; CHECK-NEXT: asrs r5, r3, #31
+; CHECK-NEXT: mla r7, r0, r5, r7
+; CHECK-NEXT: mla r3, r6, r3, r7
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: umull r1, r3, r0, r2
+; CHECK-NEXT: umull r7, r5, r0, r4
+; CHECK-NEXT: vmov q1[2], q1[0], r7, r1
+; CHECK-NEXT: asrs r1, r2, #31
+; CHECK-NEXT: mla r1, r0, r1, r3
+; CHECK-NEXT: mla r1, r6, r2, r1
+; CHECK-NEXT: asrs r2, r4, #31
+; CHECK-NEXT: mla r0, r0, r2, r5
+; CHECK-NEXT: mla r0, r6, r4, r0
+; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -434,9 +424,8 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @zext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: zext32_1357_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: umull r1, r2, r1, r0
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
@@ -455,9 +444,8 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @zext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: zext32_ext0_1357:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: vmov r3, s1
; CHECK-NEXT: umull r1, r2, r0, r1
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
@@ -526,22 +514,19 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @zext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: zext32_0213_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov.f32 s0, s1
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: umull r1, r12, r1, r0
-; CHECK-NEXT: umull r3, r2, r3, r0
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov q2[3], q2[1], r2, r12
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: umull r1, r2, r1, r0
-; CHECK-NEXT: umull r0, r3, r3, r0
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vmov r1, r12, d1
+; CHECK-NEXT: vmov r2, lr, d0
+; CHECK-NEXT: umull r1, r3, r1, r0
+; CHECK-NEXT: umull r2, r4, r2, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT: umull r1, r2, r12, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT: umull r0, r3, lr, r0
; CHECK-NEXT: vmov q1[2], q1[0], r0, r1
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = zext <4 x i32> %shuf1 to <4 x i64>
@@ -555,22 +540,19 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @zext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: zext32_ext0_0213:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov.f32 s0, s1
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: umull r1, r12, r0, r1
-; CHECK-NEXT: umull r3, r2, r0, r3
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov q2[3], q2[1], r2, r12
-; CHECK-NEXT: vmov q0, q2
-; CHECK-NEXT: umull r1, r2, r0, r1
-; CHECK-NEXT: umull r0, r3, r0, r3
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: vmov r1, r12, d1
+; CHECK-NEXT: vmov r2, lr, d0
+; CHECK-NEXT: umull r1, r3, r0, r1
+; CHECK-NEXT: umull r2, r4, r0, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT: umull r1, r2, r0, r12
+; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
+; CHECK-NEXT: umull r0, r3, r0, lr
; CHECK-NEXT: vmov q1[2], q1[0], r0, r1
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = zext <4 x i32> %shuf1 to <4 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index ff416dbe3f1a0..ed7ba3648200b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,18 +8,18 @@ define void @vst3_v2i32(ptr %src, ptr %dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldrd r12, r3, [r0]
+; CHECK-NEXT: ldrd lr, r2, [r0, #8]
; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT: vmov q1[2], q1[0], r12, r3
+; CHECK-NEXT: vmov q2[2], q2[0], lr, r2
+; CHECK-NEXT: vmov.f32 s12, s4
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT: vmov.f32 s13, s8
+; CHECK-NEXT: vmov.f32 s14, s0
; CHECK-NEXT: strd r2, r0, [r1, #16]
-; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT: vmov.32 q0[0], r4
-; CHECK-NEXT: vmov.f32 s8, s4
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s5
-; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s15, s6
+; CHECK-NEXT: vstrw.32 q3, [r1]
; CHECK-NEXT: pop {r4, pc}
entry:
%l1 = load <2 x i32>, ptr %src, align 4
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 4c30a3adf2378..46128c19e8e0b 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -480,18 +480,17 @@ define i64 @pairwise_umax_v2i64(<2 x i64> %arg) {
; SIMD128-LABEL: pairwise_umax_v2i64:
; SIMD128: .functype pairwise_umax_v2i64 (v128) -> (i64)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-; SIMD128-NEXT: local.tee $push9=, $1=, $pop10
-; SIMD128-NEXT: i64.const $push4=, -1
-; SIMD128-NEXT: i64.const $push3=, 0
-; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0
-; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0
-; SIMD128-NEXT: i64.gt_u $push2=, $pop1, $pop0
-; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2
-; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5
-; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6
-; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0
-; SIMD128-NEXT: return $pop8
+; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT: i64.const $push5=, -1
+; SIMD128-NEXT: i64.const $push4=, 0
+; SIMD128-NEXT: i64x2.extract_lane $push2=, $0, 0
+; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 1
+; SIMD128-NEXT: i64.gt_u $push3=, $pop2, $pop1
+; SIMD128-NEXT: i64.select $push6=, $pop5, $pop4, $pop3
+; SIMD128-NEXT: i64x2.replace_lane $push7=, $0, 0, $pop6
+; SIMD128-NEXT: v128.bitselect $push8=, $0, $pop0, $pop7
+; SIMD128-NEXT: i64x2.extract_lane $push9=, $pop8, 0
+; SIMD128-NEXT: return $pop9
%res = tail call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %arg)
ret i64 %res
}
@@ -554,18 +553,17 @@ define i64 @pairwise_umin_v2i64(<2 x i64> %arg) {
; SIMD128-LABEL: pairwise_umin_v2i64:
; SIMD128: .functype pairwise_umin_v2i64 (v128) -> (i64)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.shuffle $push10=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
-; SIMD128-NEXT: local.tee $push9=, $1=, $pop10
-; SIMD128-NEXT: i64.const $push4=, -1
-; SIMD128-NEXT: i64.const $push3=, 0
-; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 0
-; SIMD128-NEXT: i64x2.extract_lane $push0=, $1, 0
-; SIMD128-NEXT: i64.lt_u $push2=, $pop1, $pop0
-; SIMD128-NEXT: i64.select $push5=, $pop4, $pop3, $pop2
-; SIMD128-NEXT: i64x2.replace_lane $push6=, $0, 0, $pop5
-; SIMD128-NEXT: v128.bitselect $push7=, $0, $pop9, $pop6
-; SIMD128-NEXT: i64x2.extract_lane $push8=, $pop7, 0
-; SIMD128-NEXT: return $pop8
+; SIMD128-NEXT: i8x16.shuffle $push0=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7
+; SIMD128-NEXT: i64.const $push5=, -1
+; SIMD128-NEXT: i64.const $push4=, 0
+; SIMD128-NEXT: i64x2.extract_lane $push2=, $0, 0
+; SIMD128-NEXT: i64x2.extract_lane $push1=, $0, 1
+; SIMD128-NEXT: i64.lt_u $push3=, $pop2, $pop1
+; SIMD128-NEXT: i64.select $push6=, $pop5, $pop4, $pop3
+; SIMD128-NEXT: i64x2.replace_lane $push7=, $0, 0, $pop6
+; SIMD128-NEXT: v128.bitselect $push8=, $0, $pop0, $pop7
+; SIMD128-NEXT: i64x2.extract_lane $push9=, $pop8, 0
+; SIMD128-NEXT: return $pop9
%res = tail call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %arg)
ret i64 %res
}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 316e3f27a0a1f..10683a77bb5ae 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1477,37 +1477,33 @@ define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
;
; X64-NOVL-LABEL: movsh:
; X64-NOVL: # %bb.0:
-; X64-NOVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; X64-NOVL-NEXT: vmovsh {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
-; X64-NOVL-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,3,3,3]
-; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NOVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; X64-NOVL-NEXT: vaddsh %xmm5, %xmm0, %xmm0
-; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0]
-; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
-; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm4
-; X64-NOVL-NEXT: vpsrlq $48, %xmm2, %xmm5
-; X64-NOVL-NEXT: vaddsh %xmm4, %xmm5, %xmm4
-; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vaddsh %xmm2, %xmm3, %xmm2
+; X64-NOVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vaddsh %xmm3, %xmm4, %xmm3
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NOVL-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NOVL-NEXT: vpsrlq $48, %xmm0, %xmm5
+; X64-NOVL-NEXT: vaddsh %xmm3, %xmm5, %xmm3
+; X64-NOVL-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
; X64-NOVL-NEXT: vaddsh %xmm5, %xmm6, %xmm5
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NOVL-NEXT: vaddsh %xmm3, %xmm2, %xmm3
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NOVL-NEXT: vpsrlq $48, %xmm1, %xmm3
+; X64-NOVL-NEXT: vpsrld $16, %xmm0, %xmm5
+; X64-NOVL-NEXT: vaddsh %xmm3, %xmm5, %xmm3
+; X64-NOVL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; X64-NOVL-NEXT: vaddsh %xmm5, %xmm0, %xmm5
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NOVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; X64-NOVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; X64-NOVL-NEXT: vaddsh %xmm1, %xmm2, %xmm1
-; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NOVL-NEXT: vaddsh %xmm1, %xmm4, %xmm1
+; X64-NOVL-NEXT: vaddsh %xmm0, %xmm0, %xmm0
+; X64-NOVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NOVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X64-NOVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NOVL-NEXT: retq
%res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
%res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index ab28a3b4a2b63..1daaa20e99f58 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -64,9 +64,9 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
;
; CHECK-X64-LABEL: fail:
; CHECK-X64: # %bb.0:
-; CHECK-X64-NEXT: pslld $8, %xmm0
; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-X64-NEXT: movd %xmm0, %eax
+; CHECK-X64-NEXT: shrl $8, %eax
; CHECK-X64-NEXT: xorb $1, %al
; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107
; CHECK-X64-NEXT: setne %cl
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index a54ff67f74755..3e15c6d30c020 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2983,7 +2983,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movw %dx, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movw %ax, 9(%rdi)
; SSE2-NEXT: shrl $16, %ecx
@@ -3038,7 +3038,7 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movw %dx, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: movw %ax, 9(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 62db6d234d301..2cb50d4c721b4 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2642,7 +2642,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSE2-NEXT: movw %dx, 3(%rdi)
; SSE2-NEXT: shrl $16, %eax
; SSE2-NEXT: movb %al, 2(%rdi)
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movw %ax, 9(%rdi)
; SSE2-NEXT: shrl $16, %ecx
@@ -2685,7 +2685,7 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
; SSSE3-NEXT: movw %dx, 3(%rdi)
; SSSE3-NEXT: shrl $16, %eax
; SSSE3-NEXT: movb %al, 2(%rdi)
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: movw %ax, 9(%rdi)
; SSSE3-NEXT: shrl $16, %ecx
More information about the llvm-commits
mailing list